JoeArmani committed · Commit 300fe5d · Parent(s): d2df8da

updates through 4th iteration

Files changed:
- .gitignore +4 -0
- augmented_combined_dataset.json +0 -0
- back_translator.py +53 -22
- chatbot.py +261 -0
- chatbot2.py +839 -0
- chatbot3.py +824 -0
- chatbot4.py +1291 -0
- dialogue_augmenter.py +329 -171
- main.py +3 -3
- paraphraser.py +14 -3
- pipeline_config.py +0 -1
- processing_pipeline.py +61 -19
- quality_metrics.py +10 -92
- readme.md +4 -2
- requirements.txt +12 -10
- response_quality_checker.py +164 -0
- run_model.py +162 -0
- run_model2.py +340 -0
- run_model3.py +434 -0
- run_model4.py +237 -0
- setup.py +61 -6
- test_trained_model.py +0 -0
.gitignore
CHANGED

@@ -159,3 +159,7 @@ datasets/*
 
 processed_outputs/*
 !processed_outputs/.gitkeep
+
+chatbot_training/
+checkpoints/
+.DS_Store
augmented_combined_dataset.json
DELETED

The diff for this file is too large to render. See raw diff.
back_translator.py
CHANGED

@@ -3,6 +3,8 @@ from transformers import (
     MarianTokenizer,
 )
 
+# Retained for reference but removed from the final code.
+# This method did not seem helpful for this retrieval-based chatbot.
 class BackTranslator:
     """
     Perform Back-translation with pivot language. English -> German -> Spanish -> English

@@ -20,7 +22,7 @@ class BackTranslator:
         self.tokenizer_pivot_forward = MarianTokenizer.from_pretrained(pivot_forward_model_name)
         self.model_pivot_forward = MarianMTModel.from_pretrained(pivot_forward_model_name)
 
-        # Pivot translation
+        # Pivot translation (German to Spanish)
         pivot_backward_model_name = f'Helsinki-NLP/opus-mt-{pivot_lang}-{target_lang}'
         self.tokenizer_pivot_backward = MarianTokenizer.from_pretrained(pivot_backward_model_name)
         self.model_pivot_backward = MarianMTModel.from_pretrained(pivot_backward_model_name)

@@ -29,28 +31,57 @@ class BackTranslator:
         backward_model_name = f'Helsinki-NLP/opus-mt-{target_lang}-{source_lang}'
         self.tokenizer_backward = MarianTokenizer.from_pretrained(backward_model_name)
         self.model_backward = MarianMTModel.from_pretrained(backward_model_name)
+
+        # Set models to eval mode
+        self.model_pivot_forward.eval()
+        self.model_pivot_backward.eval()
+        self.model_backward.eval()
 
-    def back_translate(self, text):
+    def back_translate(self, text, device=None):
+        try:
+            # Move models to device if specified
+            if device is not None:
+                self.model_pivot_forward = self.model_pivot_forward.to(device)
+                self.model_pivot_backward = self.model_pivot_backward.to(device)
+                self.model_backward = self.model_backward.to(device)
+
+            # Forward translation (English to German)
+            encoded_pivot = self.tokenizer_pivot_forward([text], padding=True,
+                                                         truncation=True, return_tensors='pt')
+            if device is not None:
+                encoded_pivot = {k: v.to(device) for k, v in encoded_pivot.items()}
 
-        generated_pivot = self.model_pivot_forward.generate(**encoded_pivot)
-        pivot_text = self.tokenizer_pivot_forward.batch_decode(generated_pivot, skip_special_tokens=True)[0]
+            generated_pivot = self.model_pivot_forward.generate(**encoded_pivot)
+            if device is not None:
+                generated_pivot = generated_pivot.cpu()
+            pivot_text = self.tokenizer_pivot_forward.batch_decode(generated_pivot,
+                                                                   skip_special_tokens=True)[0]
+
+            # Pivot translation (German to Spanish)
+            encoded_back_pivot = self.tokenizer_pivot_backward([pivot_text], padding=True,
+                                                               truncation=True, return_tensors='pt')
+            if device is not None:
+                encoded_back_pivot = {k: v.to(device) for k, v in encoded_back_pivot.items()}
+
+            retranslated_pivot = self.model_pivot_backward.generate(**encoded_back_pivot)
+            if device is not None:
+                retranslated_pivot = retranslated_pivot.cpu()
+            tgt_text_back = self.tokenizer_pivot_backward.batch_decode(retranslated_pivot,
+                                                                       skip_special_tokens=True)[0]
+
+            # Backward translation (Spanish to English)
+            encoded_back = self.tokenizer_backward([tgt_text_back], padding=True,
+                                                   truncation=True, return_tensors='pt')
+            if device is not None:
+                encoded_back = {k: v.to(device) for k, v in encoded_back.items()}
+
+            retranslated = self.model_backward.generate(**encoded_back)
+            if device is not None:
+                retranslated = retranslated.cpu()
+            src_text = self.tokenizer_backward.batch_decode(retranslated,
+                                                            skip_special_tokens=True)[0]
+
+            return src_text
+        except Exception as e:
+            print(f"Error in back translation: {e}")
+            return text
chatbot.py
ADDED
@@ -0,0 +1,261 @@
import numpy as np
import tensorflow as tf
import keras
print(tf.__version__)
print(keras.__version__)
import spacy
import random
from tqdm import trange

class RetrievalChatbot:
    def __init__(
        self,
        vocab_size: int = 10000,
        max_sequence_length: int = 80,
        embedding_dim: int = 256,
        lstm_units: int = 256,
        num_attention_heads: int = 8,
        margin: float = 0.3
    ):
        self.vocab_size = vocab_size
        self.max_sequence_length = max_sequence_length
        self.embedding_dim = embedding_dim
        self.lstm_units = lstm_units
        self.num_attention_heads = num_attention_heads
        self.margin = margin

        self.nlp = spacy.load('en_core_web_md')
        self.tokenizer = tf.keras.preprocessing.text.Tokenizer(
            num_words=vocab_size,
            oov_token="<OOV>"
        )

        self.query_encoder_model, self.response_encoder_model = self._build_encoders()

    def _positional_encoding(self, position: int, d_model: int) -> tf.Tensor:
        angles = np.arange(position)[:, np.newaxis] / np.power(
            10000,
            (2 * (np.arange(d_model)[np.newaxis, :] // 2)) / d_model
        )
        sines = np.sin(angles[:, 0::2])
        cosines = np.cos(angles[:, 1::2])
        pos_encoding = np.concatenate([sines, cosines], axis=-1)
        pos_encoding = pos_encoding[np.newaxis, ...]
        return tf.cast(pos_encoding, dtype=tf.float32)

    def _build_single_encoder(self, name_prefix: str):
        input_layer = tf.keras.Input(shape=(self.max_sequence_length,), name=f"{name_prefix}_input")
        embedding = tf.keras.layers.Embedding(
            self.vocab_size,
            self.embedding_dim,
            mask_zero=True,
            name=f"{name_prefix}_embedding"
        )(input_layer)

        pos_encoding = self._positional_encoding(self.max_sequence_length, self.embedding_dim)
        x = embedding + pos_encoding

        # # Multi-head attention
        # attention_output = tf.keras.layers.MultiHeadAttention(
        #     num_heads=self.num_attention_heads,
        #     key_dim=self.embedding_dim // self.num_attention_heads
        # )(x, x)
        # x = tf.keras.layers.LayerNormalization()(x + attention_output)

        for i in range(2):
            lstm_out = tf.keras.layers.LSTM(
                self.lstm_units,
                return_sequences=True,
                kernel_regularizer=tf.keras.regularizers.l2(0.01),
                name=f"{name_prefix}_lstm_{i}"
            )(x)
            x = tf.keras.layers.LayerNormalization()(x + lstm_out)

        encoder_output = tf.keras.layers.LSTM(
            self.lstm_units,
            name=f"{name_prefix}_final_lstm"
        )(x)
        encoder_output = tf.keras.layers.Dropout(0.2)(encoder_output)
        encoder_output = tf.keras.layers.Lambda(lambda t: tf.nn.l2_normalize(t, axis=1))(encoder_output)

        return tf.keras.Model(input_layer, encoder_output, name=f"{name_prefix}_encoder")

    def _build_encoders(self):
        query_encoder = self._build_single_encoder("query")
        response_encoder = self._build_single_encoder("response")
        return query_encoder, response_encoder

    def _spacy_similarity(self, text1: str, text2: str) -> float:
        doc1 = self.nlp(text1)
        doc2 = self.nlp(text2)
        print('doc1:', doc1)
        print('doc2:', doc2)
        print('doc1.similarity(doc2):', doc1.similarity(doc2))
        return doc1.similarity(doc2)

    def prepare_dataset(self, dialogues: list, neg_samples_per_pos=3):
        # Create triplets: (query, positive, negative)
        response_pool = [
            turn['text'] for d in dialogues for turn in d['turns'] if turn['speaker'] == 'assistant'
        ]
        queries, positives, negatives = [], [], []

        for dialogue in dialogues:
            turns = dialogue['turns']
            for i in range(0, len(turns)-1):
                if turns[i]['speaker'] == 'user' and turns[i+1]['speaker'] == 'assistant':
                    q = turns[i]['text']
                    p = turns[i+1]['text']

                    # Find negatives using spaCy similarity
                    neg_candidates = []
                    attempts = 0
                    while len(neg_candidates) < neg_samples_per_pos and attempts < 200:
                        cand = random.choice(response_pool)
                        if cand != p:
                            sim = self._spacy_similarity(cand, p)
                            # Choose thresholds that produce hard negatives
                            if 0.4 < sim < 0.9:
                                neg_candidates.append(cand)
                        attempts += 1

                    if len(neg_candidates) == neg_samples_per_pos:
                        for neg in neg_candidates:
                            queries.append(q)
                            positives.append(p)
                            negatives.append(neg)

        # Fit tokenizer
        all_text = queries + positives + negatives
        self.tokenizer.fit_on_texts(all_text)

        def seq_pad(txts):
            seq = self.tokenizer.texts_to_sequences(txts)
            return tf.keras.preprocessing.sequence.pad_sequences(seq, maxlen=self.max_sequence_length, padding='post')

        q_pad = seq_pad(queries)
        p_pad = seq_pad(positives)
        n_pad = seq_pad(negatives)

        return q_pad, p_pad, n_pad

    def triplet_loss(self, q_emb, p_emb, n_emb):
        pos_dist = tf.reduce_sum(tf.square(q_emb - p_emb), axis=1)
        neg_dist = tf.reduce_sum(tf.square(q_emb - n_emb), axis=1)
        loss = tf.maximum(0.0, self.margin + pos_dist - neg_dist)
        return tf.reduce_mean(loss)

    def train_with_triplet_loss(
        self, q_pad, p_pad, n_pad,
        epochs=3,
        batch_size=16,
        validation_split=0.2,
        early_stopping_patience=3,
        use_tqdm=True
    ):
        train_losses = []
        val_losses = []

        total_samples = len(q_pad)
        idxs = np.arange(total_samples)
        np.random.shuffle(idxs)
        train_size = int((1 - validation_split) * total_samples)

        train_idxs = idxs[:train_size]
        val_idxs = idxs[train_size:]

        q_train, p_train, n_train = q_pad[train_idxs], p_pad[train_idxs], n_pad[train_idxs]
        q_val, p_val, n_val = q_pad[val_idxs], p_pad[val_idxs], n_pad[val_idxs]

        optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
        best_val_loss = float('inf')
        wait = 0

        for epoch in range(epochs):
            # Shuffle training data each epoch
            perm = np.random.permutation(len(q_train))
            q_train, p_train, n_train = q_train[perm], p_train[perm], n_train[perm]

            num_batches = len(q_train) // batch_size
            epoch_train_loss = 0.0

            batch_iter = range(num_batches)
            if use_tqdm:
                batch_iter = trange(num_batches, desc=f"Epoch {epoch+1}/{epochs}")

            for i in batch_iter:
                q_batch = q_train[i*batch_size:(i+1)*batch_size]
                p_batch = p_train[i*batch_size:(i+1)*batch_size]
                n_batch = n_train[i*batch_size:(i+1)*batch_size]

                with tf.GradientTape() as tape:
                    q_emb = self.query_encoder_model(q_batch, training=True)
                    p_emb = self.response_encoder_model(p_batch, training=True)
                    n_emb = self.response_encoder_model(n_batch, training=True)
                    loss = self.triplet_loss(q_emb, p_emb, n_emb)

                grads = tape.gradient(
                    loss,
                    self.query_encoder_model.trainable_variables +
                    self.response_encoder_model.trainable_variables
                )
                optimizer.apply_gradients(zip(
                    grads,
                    self.query_encoder_model.trainable_variables +
                    self.response_encoder_model.trainable_variables
                ))
                epoch_train_loss += loss.numpy()

            epoch_train_loss /= num_batches

            # Validation loss
            val_batches = len(q_val) // batch_size
            epoch_val_loss = 0.0
            for i in range(val_batches):
                q_batch = q_val[i*batch_size:(i+1)*batch_size]
                p_batch = p_val[i*batch_size:(i+1)*batch_size]
                n_batch = n_val[i*batch_size:(i+1)*batch_size]

                q_emb = self.query_encoder_model(q_batch, training=False)
                p_emb = self.response_encoder_model(p_batch, training=False)
                n_emb = self.response_encoder_model(n_batch, training=False)
                v_loss = self.triplet_loss(q_emb, p_emb, n_emb)
                epoch_val_loss += v_loss.numpy()

            if val_batches > 0:
                epoch_val_loss /= val_batches

            train_losses.append(epoch_train_loss)
            val_losses.append(epoch_val_loss)

            print(f"Epoch {epoch+1}/{epochs}, Train Loss: {epoch_train_loss:.4f}, Val Loss: {epoch_val_loss:.4f}")

            # Early Stopping logic
            if epoch_val_loss < best_val_loss:
                best_val_loss = epoch_val_loss
                wait = 0
                # (Optional) Save best weights
            else:
                wait += 1
                if wait >= early_stopping_patience:
                    print("Early stopping triggered.")
                    break

        return train_losses, val_losses

    def encode_texts(self, texts, is_query=True):
        seq = self.tokenizer.texts_to_sequences(texts)
        pad_seq = tf.keras.preprocessing.sequence.pad_sequences(seq, maxlen=self.max_sequence_length, padding='post')
        if is_query:
            return self.query_encoder_model(pad_seq, training=False)
        else:
            return self.response_encoder_model(pad_seq, training=False)

    def retrieve_top_n(self, query: str, candidates: list, top_n=5):
        q_emb = self.encode_texts([query], is_query=True)      # shape (1, d)
        c_emb = self.encode_texts(candidates, is_query=False)  # shape (num_cand, d)
        sim = tf.matmul(q_emb, c_emb, transpose_b=True).numpy()[0]  # dot product similarity
        top_indices = np.argsort(sim)[::-1][:top_n]
        return [(candidates[i], sim[i]) for i in top_indices]
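
A hedged end-to-end sketch (not part of the commit) of how this first-iteration RetrievalChatbot could be exercised: prepare triplets, train the dual encoders with the margin-based triplet loss, then rank candidate responses. The toy dialogues only illustrate the {'turns': [{'speaker', 'text'}]} structure prepare_dataset() expects; with a corpus this small the negative sampler will not find enough hard negatives, so real training would use the full augmented dataset.

# Hypothetical driver, for illustration only.
from chatbot import RetrievalChatbot

dialogues = [
    {
        "turns": [
            {"speaker": "user", "text": "I need a hotel in the city centre for two nights."},
            {"speaker": "assistant", "text": "Sure, what price range are you looking for?"},
            {"speaker": "user", "text": "Something moderately priced, please."},
            {"speaker": "assistant", "text": "The Alpha Hotel is moderately priced and central. Shall I book it?"},
        ]
    },
    # ... many more dialogues in practice; a tiny corpus yields no triplets ...
]

bot = RetrievalChatbot(vocab_size=10000, max_sequence_length=80)
q_pad, p_pad, n_pad = bot.prepare_dataset(dialogues, neg_samples_per_pos=3)
train_losses, val_losses = bot.train_with_triplet_loss(q_pad, p_pad, n_pad, epochs=3, batch_size=16)

candidates = [t["text"] for d in dialogues for t in d["turns"] if t["speaker"] == "assistant"]
for response, score in bot.retrieve_top_n("Can you find me a place to stay?", candidates, top_n=3):
    print(f"{score:.3f}  {response}")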
chatbot2.py
ADDED
@@ -0,0 +1,839 @@
1 |
+
import numpy as np
|
2 |
+
import tensorflow as tf
|
3 |
+
import spacy
|
4 |
+
import random
|
5 |
+
from typing import List, Tuple, Dict, Optional, Union
|
6 |
+
from dataclasses import dataclass
|
7 |
+
from tqdm import tqdm
|
8 |
+
import logging
|
9 |
+
from pathlib import Path
|
10 |
+
import json
|
11 |
+
|
12 |
+
# Configure logging
|
13 |
+
logging.basicConfig(
|
14 |
+
level=logging.INFO,
|
15 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
16 |
+
)
|
17 |
+
logger = logging.getLogger(__name__)
|
18 |
+
|
19 |
+
@dataclass
|
20 |
+
class ChatbotConfig:
|
21 |
+
"""Configuration for the retrieval chatbot."""
|
22 |
+
vocab_size: int = 10000
|
23 |
+
max_sequence_length: int = 512
|
24 |
+
embedding_dim: int = 256
|
25 |
+
encoder_units: int = 256
|
26 |
+
num_attention_heads: int = 8
|
27 |
+
dropout_rate: float = 0.2
|
28 |
+
l2_reg_weight: float = 0.001
|
29 |
+
margin: float = 0.3
|
30 |
+
learning_rate: float = 0.001
|
31 |
+
min_text_length: int = 3 # Reduced from 10 to allow shorter responses
|
32 |
+
max_context_turns: int = 5
|
33 |
+
warmup_steps: int = 200
|
34 |
+
spacy_model: str = 'en_core_web_md'
|
35 |
+
|
36 |
+
def to_dict(self) -> dict:
|
37 |
+
"""Convert config to dictionary."""
|
38 |
+
return {k: str(v) if isinstance(v, Path) else v
|
39 |
+
for k, v in self.__dict__.items()}
|
40 |
+
|
41 |
+
@classmethod
|
42 |
+
def from_dict(cls, config_dict: dict) -> 'ChatbotConfig':
|
43 |
+
"""Create config from dictionary."""
|
44 |
+
return cls(**{k: v for k, v in config_dict.items()
|
45 |
+
if k in cls.__dataclass_fields__})
|
46 |
+
|
47 |
+
class TransformerBlock(tf.keras.layers.Layer):
|
48 |
+
"""Custom Transformer block with pre-layer normalization."""
|
49 |
+
def __init__(
|
50 |
+
self,
|
51 |
+
embed_dim: int,
|
52 |
+
num_heads: int,
|
53 |
+
ff_dim: int,
|
54 |
+
dropout: float = 0.1,
|
55 |
+
**kwargs
|
56 |
+
):
|
57 |
+
super().__init__(**kwargs)
|
58 |
+
self.embed_dim = embed_dim
|
59 |
+
self.num_heads = num_heads
|
60 |
+
self.ff_dim = ff_dim
|
61 |
+
self.dropout = dropout
|
62 |
+
|
63 |
+
self.attention = tf.keras.layers.MultiHeadAttention(
|
64 |
+
num_heads=num_heads,
|
65 |
+
key_dim=embed_dim // num_heads,
|
66 |
+
dropout=dropout
|
67 |
+
)
|
68 |
+
self.ffn = tf.keras.Sequential([
|
69 |
+
tf.keras.layers.Dense(ff_dim, activation="gelu"),
|
70 |
+
tf.keras.layers.Dense(embed_dim),
|
71 |
+
])
|
72 |
+
|
73 |
+
self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
|
74 |
+
self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
|
75 |
+
self.dropout1 = tf.keras.layers.Dropout(dropout)
|
76 |
+
self.dropout2 = tf.keras.layers.Dropout(dropout)
|
77 |
+
|
78 |
+
def call(self, inputs: tf.Tensor, training: bool, mask: Optional[tf.Tensor] = None) -> tf.Tensor:
|
79 |
+
# Pre-layer normalization
|
80 |
+
norm_inputs = self.layernorm1(inputs)
|
81 |
+
|
82 |
+
# Self-attention
|
83 |
+
attention_output = self.attention(
|
84 |
+
query=norm_inputs,
|
85 |
+
value=norm_inputs,
|
86 |
+
key=norm_inputs,
|
87 |
+
attention_mask=mask,
|
88 |
+
training=training
|
89 |
+
)
|
90 |
+
attention_output = self.dropout1(attention_output, training=training)
|
91 |
+
attention_output = inputs + attention_output
|
92 |
+
|
93 |
+
# Feed-forward network
|
94 |
+
norm_attention = self.layernorm2(attention_output)
|
95 |
+
ffn_output = self.ffn(norm_attention)
|
96 |
+
ffn_output = self.dropout2(ffn_output, training=training)
|
97 |
+
|
98 |
+
return attention_output + ffn_output
|
99 |
+
|
100 |
+
def get_config(self) -> dict:
|
101 |
+
config = super().get_config()
|
102 |
+
config.update({
|
103 |
+
"embed_dim": self.embed_dim,
|
104 |
+
"num_heads": self.num_heads,
|
105 |
+
"ff_dim": self.ff_dim,
|
106 |
+
"dropout": self.dropout,
|
107 |
+
})
|
108 |
+
return config
|
109 |
+
|
110 |
+
class EncoderModel(tf.keras.Model):
|
111 |
+
"""Dual encoder model with shared weights option."""
|
112 |
+
def __init__(
|
113 |
+
self,
|
114 |
+
config: ChatbotConfig,
|
115 |
+
name: str = "encoder",
|
116 |
+
shared_weights: bool = False,
|
117 |
+
**kwargs
|
118 |
+
):
|
119 |
+
super().__init__(name=name, **kwargs)
|
120 |
+
self.config = config
|
121 |
+
self.shared_weights = shared_weights
|
122 |
+
|
123 |
+
# Input embedding layer
|
124 |
+
self.embedding = tf.keras.layers.Embedding(
|
125 |
+
config.vocab_size,
|
126 |
+
config.embedding_dim,
|
127 |
+
mask_zero=True,
|
128 |
+
name=f"{name}_embedding"
|
129 |
+
)
|
130 |
+
|
131 |
+
# Positional encoding
|
132 |
+
self.pos_encoding = self._get_positional_encoding()
|
133 |
+
|
134 |
+
# Transformer blocks
|
135 |
+
self.transformer_blocks = [
|
136 |
+
TransformerBlock(
|
137 |
+
config.embedding_dim,
|
138 |
+
config.num_attention_heads,
|
139 |
+
config.encoder_units * 4,
|
140 |
+
config.dropout_rate,
|
141 |
+
name=f"{name}_transformer_{i}"
|
142 |
+
) for i in range(3)
|
143 |
+
]
|
144 |
+
|
145 |
+
# Final LSTM layer
|
146 |
+
self.final_lstm = tf.keras.layers.LSTM(
|
147 |
+
config.encoder_units,
|
148 |
+
kernel_regularizer=tf.keras.regularizers.l2(config.l2_reg_weight),
|
149 |
+
name=f"{name}_final_lstm"
|
150 |
+
)
|
151 |
+
|
152 |
+
self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
|
153 |
+
self.normalize = tf.keras.layers.Lambda(
|
154 |
+
lambda x: tf.nn.l2_normalize(x, axis=1)
|
155 |
+
)
|
156 |
+
|
157 |
+
def _get_positional_encoding(self) -> tf.Tensor:
|
158 |
+
"""Generate positional encoding matrix."""
|
159 |
+
pos = np.arange(self.config.max_sequence_length)[:, np.newaxis]
|
160 |
+
i = np.arange(self.config.embedding_dim)[np.newaxis, :]
|
161 |
+
angle = pos / np.power(10000, (2 * (i // 2)) / self.config.embedding_dim)
|
162 |
+
|
163 |
+
pos_encoding = np.zeros_like(angle)
|
164 |
+
pos_encoding[:, 0::2] = np.sin(angle[:, 0::2])
|
165 |
+
pos_encoding[:, 1::2] = np.cos(angle[:, 1::2])
|
166 |
+
|
167 |
+
return tf.cast(pos_encoding[np.newaxis, ...], dtype=tf.float32)
|
168 |
+
|
169 |
+
def call(self, inputs: tf.Tensor, training: bool = False) -> tf.Tensor:
|
170 |
+
# Get input mask
|
171 |
+
mask = self.embedding.compute_mask(inputs)
|
172 |
+
mask = mask[:, tf.newaxis, tf.newaxis, :] # Add attention dims
|
173 |
+
|
174 |
+
# Embedding + positional encoding
|
175 |
+
x = self.embedding(inputs)
|
176 |
+
x = x + self.pos_encoding
|
177 |
+
|
178 |
+
# Apply transformer blocks
|
179 |
+
for transformer_block in self.transformer_blocks:
|
180 |
+
x = transformer_block(x, training=training, mask=mask)
|
181 |
+
|
182 |
+
# Final processing
|
183 |
+
x = self.final_lstm(x)
|
184 |
+
x = self.dropout(x, training=training)
|
185 |
+
return self.normalize(x)
|
186 |
+
|
187 |
+
class RetrievalChatbot:
|
188 |
+
"""Professional implementation of a retrieval-based chatbot."""
|
189 |
+
def __init__(self, config: ChatbotConfig):
|
190 |
+
self.config = config
|
191 |
+
self.nlp = spacy.load(config.spacy_model)
|
192 |
+
|
193 |
+
# Initialize tokenizer
|
194 |
+
self.tokenizer = tf.keras.preprocessing.text.Tokenizer(
|
195 |
+
num_words=config.vocab_size,
|
196 |
+
oov_token="<OOV>"
|
197 |
+
)
|
198 |
+
|
199 |
+
# Special tokens
|
200 |
+
self.special_tokens = {
|
201 |
+
"user": "<USER>",
|
202 |
+
"assistant": "<ASSISTANT>",
|
203 |
+
"context": "<CONTEXT>",
|
204 |
+
"sep": "<SEP>"
|
205 |
+
}
|
206 |
+
|
207 |
+
# Build models
|
208 |
+
self._build_models()
|
209 |
+
|
210 |
+
# Training history
|
211 |
+
self.history = {
|
212 |
+
"train_loss": [],
|
213 |
+
"val_loss": [],
|
214 |
+
"train_metrics": {},
|
215 |
+
"val_metrics": {}
|
216 |
+
}
|
217 |
+
|
218 |
+
# Initialize similarity cache
|
219 |
+
self.similarity_cache = {}
|
220 |
+
|
221 |
+
def _build_models(self):
|
222 |
+
"""Initialize the encoder models."""
|
223 |
+
# Query encoder
|
224 |
+
self.query_encoder = EncoderModel(
|
225 |
+
self.config,
|
226 |
+
name="query_encoder",
|
227 |
+
shared_weights=False
|
228 |
+
)
|
229 |
+
|
230 |
+
# Response encoder (can share weights with query encoder)
|
231 |
+
self.response_encoder = EncoderModel(
|
232 |
+
self.config,
|
233 |
+
name="response_encoder",
|
234 |
+
shared_weights=False
|
235 |
+
)
|
236 |
+
|
237 |
+
def save_models(self, save_dir: Union[str, Path]):
|
238 |
+
"""Save models and configuration."""
|
239 |
+
save_dir = Path(save_dir)
|
240 |
+
save_dir.mkdir(parents=True, exist_ok=True)
|
241 |
+
|
242 |
+
# Save config
|
243 |
+
with open(save_dir / "config.json", "w") as f:
|
244 |
+
json.dump(self.config.to_dict(), f, indent=2)
|
245 |
+
|
246 |
+
# Save models with proper extension
|
247 |
+
self.query_encoder.save(save_dir / "query_encoder.keras")
|
248 |
+
self.response_encoder.save(save_dir / "response_encoder.keras")
|
249 |
+
|
250 |
+
# Save tokenizer config
|
251 |
+
tokenizer_config = {
|
252 |
+
"word_index": self.tokenizer.word_index,
|
253 |
+
"word_counts": self.tokenizer.word_counts,
|
254 |
+
"document_count": self.tokenizer.document_count,
|
255 |
+
"index_docs": self.tokenizer.index_docs,
|
256 |
+
"index_word": self.tokenizer.index_word
|
257 |
+
}
|
258 |
+
with open(save_dir / "tokenizer_config.json", "w") as f:
|
259 |
+
json.dump(tokenizer_config, f)
|
260 |
+
|
261 |
+
@classmethod
|
262 |
+
def load_models(cls, load_dir: Union[str, Path]) -> 'RetrievalChatbot':
|
263 |
+
"""Load saved models and configuration."""
|
264 |
+
load_dir = Path(load_dir)
|
265 |
+
|
266 |
+
# Load config
|
267 |
+
with open(load_dir / "config.json", "r") as f:
|
268 |
+
config = ChatbotConfig.from_dict(json.load(f))
|
269 |
+
|
270 |
+
# Initialize chatbot
|
271 |
+
chatbot = cls(config)
|
272 |
+
|
273 |
+
# Load models with proper extension
|
274 |
+
chatbot.query_encoder = tf.keras.models.load_model(
|
275 |
+
load_dir / "query_encoder.keras",
|
276 |
+
custom_objects={"TransformerBlock": TransformerBlock}
|
277 |
+
)
|
278 |
+
chatbot.response_encoder = tf.keras.models.load_model(
|
279 |
+
load_dir / "response_encoder.keras",
|
280 |
+
custom_objects={"TransformerBlock": TransformerBlock}
|
281 |
+
)
|
282 |
+
|
283 |
+
# Load tokenizer config
|
284 |
+
with open(load_dir / "tokenizer_config.json", "r") as f:
|
285 |
+
tokenizer_config = json.load(f)
|
286 |
+
|
287 |
+
chatbot.tokenizer = tf.keras.preprocessing.text.Tokenizer(
|
288 |
+
num_words=config.vocab_size,
|
289 |
+
oov_token="<OOV>"
|
290 |
+
)
|
291 |
+
chatbot.tokenizer.word_index = tokenizer_config["word_index"]
|
292 |
+
chatbot.tokenizer.word_counts = tokenizer_config["word_counts"]
|
293 |
+
chatbot.tokenizer.document_count = tokenizer_config["document_count"]
|
294 |
+
chatbot.tokenizer.index_docs = tokenizer_config["index_docs"]
|
295 |
+
chatbot.tokenizer.index_word = tokenizer_config["index_word"]
|
296 |
+
|
297 |
+
return chatbot
|
298 |
+
|
299 |
+
def _improved_spacy_similarity(self, text1: str, text2: str) -> float:
|
300 |
+
"""Calculate semantic similarity between texts with preprocessing."""
|
301 |
+
def preprocess(text: str) -> str:
|
302 |
+
# Basic cleaning
|
303 |
+
text = ' '.join(text.split())
|
304 |
+
return text if text.strip() else "empty_document"
|
305 |
+
|
306 |
+
# Get cache key
|
307 |
+
cache_key = f"{hash(text1)}_{hash(text2)}"
|
308 |
+
if cache_key in self.similarity_cache:
|
309 |
+
return self.similarity_cache[cache_key]
|
310 |
+
|
311 |
+
# Process texts
|
312 |
+
text1, text2 = preprocess(text1), preprocess(text2)
|
313 |
+
doc1, doc2 = self.nlp(text1), self.nlp(text2)
|
314 |
+
|
315 |
+
# Calculate similarity
|
316 |
+
if doc1.has_vector and doc2.has_vector:
|
317 |
+
sim = doc1.similarity(doc2)
|
318 |
+
else:
|
319 |
+
# Fallback to token overlap similarity
|
320 |
+
tokens1 = {t.lower_ for t in doc1 if not t.is_stop and not t.is_punct}
|
321 |
+
tokens2 = {t.lower_ for t in doc2 if not t.is_stop and not t.is_punct}
|
322 |
+
intersection = len(tokens1.intersection(tokens2))
|
323 |
+
union = len(tokens1.union(tokens2))
|
324 |
+
sim = intersection / union if union > 0 else 0.0
|
325 |
+
|
326 |
+
# Cache result
|
327 |
+
self.similarity_cache[cache_key] = sim
|
328 |
+
return sim
|
329 |
+
|
330 |
+
def _smart_negative_sampling(
|
331 |
+
self,
|
332 |
+
positive: str,
|
333 |
+
response_pool: List[str],
|
334 |
+
n_samples: int,
|
335 |
+
max_attempts: int = 200,
|
336 |
+
similarity_bounds: Tuple[float, float] = (0.3, 0.8),
|
337 |
+
batch_size: int = 10
|
338 |
+
) -> List[str]:
|
339 |
+
"""Smart negative sampling with similarity bounds and batching."""
|
340 |
+
candidates = []
|
341 |
+
seen = set()
|
342 |
+
attempts = 0
|
343 |
+
|
344 |
+
while len(candidates) < n_samples and attempts < max_attempts:
|
345 |
+
# Batch process candidates
|
346 |
+
batch = random.sample(
|
347 |
+
response_pool,
|
348 |
+
min(batch_size, max_attempts - attempts)
|
349 |
+
)
|
350 |
+
|
351 |
+
for candidate in batch:
|
352 |
+
if candidate != positive and candidate not in seen:
|
353 |
+
seen.add(candidate)
|
354 |
+
sim = self._improved_spacy_similarity(candidate, positive)
|
355 |
+
|
356 |
+
# Check similarity bounds
|
357 |
+
if similarity_bounds[0] < sim < similarity_bounds[1]:
|
358 |
+
candidates.append(candidate)
|
359 |
+
if len(candidates) == n_samples:
|
360 |
+
break
|
361 |
+
|
362 |
+
attempts += len(batch)
|
363 |
+
|
364 |
+
return candidates
|
365 |
+
|
366 |
+
def train(
|
367 |
+
self,
|
368 |
+
q_pad: tf.Tensor,
|
369 |
+
p_pad: tf.Tensor,
|
370 |
+
n_pad: tf.Tensor,
|
371 |
+
epochs: int = 3,
|
372 |
+
batch_size: int = 32,
|
373 |
+
validation_split: float = 0.2,
|
374 |
+
checkpoint_dir: Optional[Union[str, Path]] = None
|
375 |
+
):
|
376 |
+
"""Train the model with improved training loop."""
|
377 |
+
# Setup training
|
378 |
+
total_samples = len(q_pad)
|
379 |
+
train_size = int((1 - validation_split) * total_samples)
|
380 |
+
|
381 |
+
# Split data
|
382 |
+
indices = np.random.permutation(total_samples)
|
383 |
+
train_idx, val_idx = indices[:train_size], indices[train_size:]
|
384 |
+
|
385 |
+
train_data = (q_pad[train_idx], p_pad[train_idx], n_pad[train_idx])
|
386 |
+
val_data = (q_pad[val_idx], p_pad[val_idx], n_pad[val_idx])
|
387 |
+
|
388 |
+
# Setup optimizer with learning rate schedule
|
389 |
+
steps_per_epoch = train_size // batch_size
|
390 |
+
total_steps = steps_per_epoch * epochs
|
391 |
+
|
392 |
+
lr_schedule = self._get_lr_schedule(
|
393 |
+
total_steps,
|
394 |
+
self.config.learning_rate,
|
395 |
+
self.config.warmup_steps
|
396 |
+
)
|
397 |
+
|
398 |
+
optimizer = tf.keras.optimizers.Adam(lr_schedule)
|
399 |
+
|
400 |
+
# Setup checkpointing
|
401 |
+
if checkpoint_dir:
|
402 |
+
checkpoint_dir = Path(checkpoint_dir)
|
403 |
+
checkpoint_dir.mkdir(parents=True, exist_ok=True)
|
404 |
+
|
405 |
+
# Setup checkpoint callback with correct file format
|
406 |
+
checkpoint_template = str(checkpoint_dir / "model_epoch_{epoch:04d}.weights.h5")
|
407 |
+
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
|
408 |
+
checkpoint_template,
|
409 |
+
save_weights_only=True,
|
410 |
+
save_best_only=True,
|
411 |
+
monitor='val_loss',
|
412 |
+
mode='min',
|
413 |
+
verbose=1
|
414 |
+
)
|
415 |
+
|
416 |
+
# Training loop
|
417 |
+
best_val_loss = float('inf')
|
418 |
+
patience = 5
|
419 |
+
wait = 0
|
420 |
+
|
421 |
+
for epoch in range(epochs):
|
422 |
+
# Training
|
423 |
+
train_loss = self._train_epoch(
|
424 |
+
train_data,
|
425 |
+
optimizer,
|
426 |
+
batch_size,
|
427 |
+
training=True
|
428 |
+
)
|
429 |
+
|
430 |
+
# Validation
|
431 |
+
val_loss = self._train_epoch(
|
432 |
+
val_data,
|
433 |
+
optimizer,
|
434 |
+
batch_size,
|
435 |
+
training=False
|
436 |
+
)
|
437 |
+
|
438 |
+
# Update history
|
439 |
+
self.history['train_loss'].append(train_loss)
|
440 |
+
self.history['val_loss'].append(val_loss)
|
441 |
+
|
442 |
+
logger.info(
|
443 |
+
f"Epoch {epoch + 1}/{epochs} - "
|
444 |
+
f"train_loss: {train_loss:.4f} - "
|
445 |
+
f"val_loss: {val_loss:.4f}"
|
446 |
+
)
|
447 |
+
|
448 |
+
# Early stopping
|
449 |
+
if val_loss < best_val_loss:
|
450 |
+
best_val_loss = val_loss
|
451 |
+
wait = 0
|
452 |
+
if checkpoint_dir:
|
453 |
+
self.save_models(checkpoint_dir / f"best_model")
|
454 |
+
else:
|
455 |
+
wait += 1
|
456 |
+
if wait >= patience:
|
457 |
+
logger.info("Early stopping triggered")
|
458 |
+
break
|
459 |
+
|
460 |
+
def _get_lr_schedule(
|
461 |
+
self,
|
462 |
+
total_steps: int,
|
463 |
+
peak_lr: float,
|
464 |
+
warmup_steps: int
|
465 |
+
) -> tf.keras.optimizers.schedules.LearningRateSchedule:
|
466 |
+
"""Enhanced learning rate schedule with better error handling and logging."""
|
467 |
+
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
|
468 |
+
def __init__(
|
469 |
+
self,
|
470 |
+
total_steps: int,
|
471 |
+
peak_lr: float,
|
472 |
+
warmup_steps: int
|
473 |
+
):
|
474 |
+
super().__init__()
|
475 |
+
self.total_steps = tf.cast(total_steps, tf.float32)
|
476 |
+
self.peak_lr = tf.cast(peak_lr, tf.float32)
|
477 |
+
self.warmup_steps = tf.cast(max(1, warmup_steps), tf.float32) # Prevent 0
|
478 |
+
|
479 |
+
# Calculate and store constants
|
480 |
+
self.initial_lr = self.peak_lr * 0.1 # Start at 10% of peak
|
481 |
+
self.min_lr = self.peak_lr * 0.01 # Minimum 1% of peak
|
482 |
+
|
483 |
+
logger.info(f"Learning rate schedule initialized:")
|
484 |
+
logger.info(f" Initial LR: {float(self.initial_lr):.6f}")
|
485 |
+
logger.info(f" Peak LR: {float(self.peak_lr):.6f}")
|
486 |
+
logger.info(f" Min LR: {float(self.min_lr):.6f}")
|
487 |
+
logger.info(f" Warmup steps: {int(self.warmup_steps)}")
|
488 |
+
logger.info(f" Total steps: {int(self.total_steps)}")
|
489 |
+
|
490 |
+
def __call__(self, step):
|
491 |
+
step = tf.cast(step, tf.float32)
|
492 |
+
|
493 |
+
# Warmup phase
|
494 |
+
warmup_factor = tf.minimum(1.0, step / self.warmup_steps)
|
495 |
+
warmup_lr = self.initial_lr + (self.peak_lr - self.initial_lr) * warmup_factor
|
496 |
+
|
497 |
+
# Decay phase
|
498 |
+
decay_steps = tf.maximum(1.0, self.total_steps - self.warmup_steps)
|
499 |
+
decay_factor = (step - self.warmup_steps) / decay_steps
|
500 |
+
decay_factor = tf.minimum(tf.maximum(0.0, decay_factor), 1.0) # Clip to [0,1]
|
501 |
+
|
502 |
+
cosine_decay = 0.5 * (1.0 + tf.cos(tf.constant(np.pi) * decay_factor))
|
503 |
+
decay_lr = self.min_lr + (self.peak_lr - self.min_lr) * cosine_decay
|
504 |
+
|
505 |
+
# Choose between warmup and decay
|
506 |
+
final_lr = tf.where(step < self.warmup_steps, warmup_lr, decay_lr)
|
507 |
+
|
508 |
+
# Ensure learning rate is valid
|
509 |
+
final_lr = tf.maximum(self.min_lr, final_lr)
|
510 |
+
final_lr = tf.where(tf.math.is_finite(final_lr), final_lr, self.min_lr)
|
511 |
+
|
512 |
+
return final_lr
|
513 |
+
|
514 |
+
def get_config(self):
|
515 |
+
return {
|
516 |
+
"total_steps": self.total_steps,
|
517 |
+
"peak_lr": self.peak_lr,
|
518 |
+
"warmup_steps": self.warmup_steps,
|
519 |
+
}
|
520 |
+
|
521 |
+
return CustomSchedule(total_steps, peak_lr, warmup_steps)
|
522 |
+
|
523 |
+
@tf.function
|
524 |
+
def _train_step(
|
525 |
+
self,
|
526 |
+
q_batch: tf.Tensor,
|
527 |
+
p_batch: tf.Tensor,
|
528 |
+
n_batch: tf.Tensor,
|
529 |
+
optimizer: tf.keras.optimizers.Optimizer,
|
530 |
+
training: bool = True
|
531 |
+
) -> tf.Tensor:
|
532 |
+
"""Single training step with triplet loss."""
|
533 |
+
with tf.GradientTape() as tape:
|
534 |
+
# Get embeddings
|
535 |
+
q_emb = self.query_encoder(q_batch, training=training)
|
536 |
+
p_emb = self.response_encoder(p_batch, training=training)
|
537 |
+
n_emb = self.response_encoder(n_batch, training=training)
|
538 |
+
|
539 |
+
# Calculate triplet loss
|
540 |
+
pos_dist = tf.reduce_sum(tf.square(q_emb - p_emb), axis=1)
|
541 |
+
neg_dist = tf.reduce_sum(tf.square(q_emb - n_emb), axis=1)
|
542 |
+
|
543 |
+
loss = tf.maximum(0.0, self.config.margin + pos_dist - neg_dist)
|
544 |
+
loss = tf.reduce_mean(loss)
|
545 |
+
|
546 |
+
if training:
|
547 |
+
# Apply gradients
|
548 |
+
gradients = tape.gradient(
|
549 |
+
loss,
|
550 |
+
self.query_encoder.trainable_variables +
|
551 |
+
self.response_encoder.trainable_variables
|
552 |
+
)
|
553 |
+
optimizer.apply_gradients(zip(
|
554 |
+
gradients,
|
555 |
+
self.query_encoder.trainable_variables +
|
556 |
+
self.response_encoder.trainable_variables
|
557 |
+
))
|
558 |
+
|
559 |
+
return loss
|
560 |
+
|
561 |
+
def _train_epoch(
|
562 |
+
self,
|
563 |
+
data: Tuple[tf.Tensor, tf.Tensor, tf.Tensor],
|
564 |
+
optimizer: tf.keras.optimizers.Optimizer,
|
565 |
+
batch_size: int,
|
566 |
+
training: bool = True
|
567 |
+
) -> float:
|
568 |
+
"""Train for one epoch with enhanced logging and progress tracking."""
|
569 |
+
q_data, p_data, n_data = data
|
570 |
+
total_loss = 0
|
571 |
+
num_batches = len(q_data) // batch_size
|
572 |
+
|
573 |
+
# Log current learning rate at start of epoch
|
574 |
+
if training:
|
575 |
+
if hasattr(optimizer.learning_rate, '__call__'):
|
576 |
+
current_lr = optimizer.learning_rate(optimizer.iterations)
|
577 |
+
else:
|
578 |
+
current_lr = optimizer.learning_rate
|
579 |
+
logger.info(f"Current learning rate: {float(current_lr):.6f}")
|
580 |
+
|
581 |
+
# Shuffle data
|
582 |
+
indices = np.random.permutation(len(q_data))
|
583 |
+
q_data = q_data[indices]
|
584 |
+
p_data = p_data[indices]
|
585 |
+
n_data = n_data[indices]
|
586 |
+
|
587 |
+
# Create progress bar
|
588 |
+
mode = "Training" if training else "Validation"
|
589 |
+
pbar = tqdm(
|
590 |
+
total=num_batches,
|
591 |
+
desc=f"{mode} batches",
|
592 |
+
unit="batch",
|
593 |
+
dynamic_ncols=True # Automatically adjust width
|
594 |
+
)
|
595 |
+
|
596 |
+
# Process batches
|
597 |
+
for i in range(num_batches):
|
598 |
+
start_idx = i * batch_size
|
599 |
+
end_idx = start_idx + batch_size
|
600 |
+
|
601 |
+
batch_loss = self._train_step(
|
602 |
+
q_data[start_idx:end_idx],
|
603 |
+
p_data[start_idx:end_idx],
|
604 |
+
n_data[start_idx:end_idx],
|
605 |
+
optimizer,
|
606 |
+
training
|
607 |
+
)
|
608 |
+
total_loss += batch_loss
|
609 |
+
|
610 |
+
# Update progress bar with current loss
|
611 |
+
avg_loss = total_loss / (i + 1)
|
612 |
+
pbar.set_postfix({
|
613 |
+
'loss': f'{avg_loss:.4f}',
|
614 |
+
'lr': f'{float(current_lr):.6f}' if training else 'N/A'
|
615 |
+
})
|
616 |
+
pbar.update(1)
|
617 |
+
|
618 |
+
pbar.close()
|
619 |
+
return total_loss / num_batches if num_batches > 0 else 0
|
620 |
+
|
621 |
+
def _prepare_sequences(
|
622 |
+
self,
|
623 |
+
queries: List[str],
|
624 |
+
positives: List[str],
|
625 |
+
negatives: List[str]
|
626 |
+
) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
|
627 |
+
"""Enhanced sequence preparation with logging and text preprocessing."""
|
628 |
+
logger.info("Preparing sequences...")
|
629 |
+
|
630 |
+
# Text cleaning function from old version
|
631 |
+
def clean_text(text: str) -> str:
|
632 |
+
# Remove excessive whitespace
|
633 |
+
text = ' '.join(text.split())
|
634 |
+
# Remove very long repetitive sequences
|
635 |
+
if len(text) > 500: # Add length limit
|
636 |
+
text = ' '.join(dict.fromkeys(text.split()))
|
637 |
+
return text
|
638 |
+
|
639 |
+
# Process texts with special tokens and cleaning
|
640 |
+
queries = [f"{self.special_tokens['user']} {clean_text(q)}" for q in queries]
|
641 |
+
positives = [f"{self.special_tokens['assistant']} {clean_text(p)}" for p in positives]
|
642 |
+
negatives = [f"{self.special_tokens['assistant']} {clean_text(n)}" for n in negatives]
|
643 |
+
|
644 |
+
# Fit tokenizer and log vocabulary statistics
|
645 |
+
all_texts = queries + positives + negatives
|
646 |
+
self.tokenizer.fit_on_texts(all_texts)
|
647 |
+
|
648 |
+
# Log vocabulary statistics
|
649 |
+
vocab_size = len(self.tokenizer.word_index)
|
650 |
+
logger.info(f"Vocabulary statistics:")
|
651 |
+
logger.info(f" Total unique tokens: {vocab_size}")
|
652 |
+
logger.info(f" Vocab limit: {self.config.vocab_size}")
|
653 |
+
|
654 |
+
# Log most common tokens
|
655 |
+
word_freq = sorted(
|
656 |
+
self.tokenizer.word_counts.items(),
|
657 |
+
key=lambda x: x[1],
|
658 |
+
reverse=True
|
659 |
+
)[:10]
|
660 |
+
logger.info("Most common tokens:")
|
661 |
+
for word, freq in word_freq:
|
662 |
+
logger.info(f" {word}: {freq}")
|
663 |
+
|
664 |
+
# Padding function from old version
|
665 |
+
def pad_sequences(texts: List[str]) -> tf.Tensor:
|
666 |
+
sequences = self.tokenizer.texts_to_sequences(texts)
|
667 |
+
return tf.keras.preprocessing.sequence.pad_sequences(
|
668 |
+
sequences,
|
669 |
+
maxlen=self.config.max_sequence_length,
|
670 |
+
padding='post',
|
671 |
+
truncating='post'
|
672 |
+
)
|
673 |
+
|
674 |
+
# Return padded sequences
|
675 |
+
return (
|
676 |
+
pad_sequences(queries),
|
677 |
+
pad_sequences(positives),
|
678 |
+
pad_sequences(negatives)
|
679 |
+
)
|
680 |
+
|
681 |
+
def prepare_dataset(
|
682 |
+
self,
|
683 |
+
dialogues: List[dict],
|
684 |
+
neg_samples_per_pos: int = 3,
|
685 |
+
debug_samples: Optional[int] = None
|
686 |
+
) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
|
687 |
+
"""Prepare dataset with enhanced logging and statistics."""
|
688 |
+
logger.info("Preparing dataset...")
|
689 |
+
|
690 |
+
# Log dataset statistics
|
691 |
+
total_dialogues = len(dialogues)
|
692 |
+
total_turns = sum(len(d['turns']) for d in dialogues)
|
693 |
+
avg_turns = total_turns / total_dialogues
|
694 |
+
|
695 |
+
logger.info(f"Dataset statistics:")
|
696 |
+
logger.info(f" Total dialogues: {total_dialogues}")
|
697 |
+
logger.info(f" Total turns: {total_turns}")
|
698 |
+
logger.info(f" Average turns per dialogue: {avg_turns:.2f}")
|
699 |
+
|
700 |
+
# Extract and filter responses with logging
|
701 |
+
response_pool = []
|
702 |
+
skipped_short = 0
|
703 |
+
skipped_long = 0
|
704 |
+
|
705 |
+
for d in dialogues:
|
706 |
+
for turn in d['turns']:
|
707 |
+
if turn['speaker'] == 'assistant':
|
708 |
+
text = turn['text'].strip()
|
709 |
+
length = len(text.split())
|
710 |
+
if length < self.config.min_text_length:
|
711 |
+
skipped_short += 1
|
712 |
+
continue
|
713 |
+
if length > self.config.max_sequence_length:
|
714 |
+
skipped_long += 1
|
715 |
+
continue
|
716 |
+
response_pool.append(text)
|
717 |
+
|
718 |
+
logger.info(f"Response pool statistics:")
|
719 |
+
logger.info(f" Total responses: {len(response_pool)}")
|
720 |
+
logger.info(f" Skipped (too short): {skipped_short}")
|
721 |
+
logger.info(f" Skipped (too long): {skipped_long}")
|
722 |
+
|
723 |
+
# Process dialogues and create training examples
|
724 |
+
queries, positives, negatives = [], [], []
|
725 |
+
|
726 |
+
for dialogue in tqdm(dialogues, desc="Processing dialogues"):
|
727 |
+
turns = dialogue['turns']
|
728 |
+
for i in range(len(turns) - 1):
|
729 |
+
if turns[i]['speaker'] == 'user' and turns[i+1]['speaker'] == 'assistant':
|
730 |
+
query = turns[i]['text'].strip()
|
731 |
+
positive = turns[i+1]['text'].strip()
|
732 |
+
|
733 |
+
# Skip short texts
|
734 |
+
if (len(query.split()) < self.config.min_text_length or
|
735 |
+
len(positive.split()) < self.config.min_text_length): # Fixed
|
736 |
+
continue
|
737 |
+
|
738 |
+
# Get negative samples
|
739 |
+
neg_samples = self._smart_negative_sampling(
|
740 |
+
positive,
|
741 |
+
response_pool,
|
742 |
+
neg_samples_per_pos
|
743 |
+
)
|
744 |
+
|
745 |
+
if len(neg_samples) == neg_samples_per_pos:
|
746 |
+
for neg in neg_samples:
|
747 |
+
queries.append(query)
|
748 |
+
positives.append(positive)
|
749 |
+
negatives.append(neg)
|
750 |
+
|
751 |
+
# Log final dataset statistics
|
752 |
+
logger.info(f"Final dataset statistics:")
|
753 |
+
logger.info(f" Training examples: {len(queries)}")
|
754 |
+
logger.info(f" Unique queries: {len(set(queries))}")
|
755 |
+
logger.info(f" Unique responses: {len(set(positives))}")
|
756 |
+
|
757 |
+
return self._prepare_sequences(queries, positives, negatives)
|
758 |
+
|
759 |
+
def encode_query(self, query: str, context: Optional[List[Tuple[str, str]]] = None) -> tf.Tensor:
|
760 |
+
"""Encode a query with optional conversation context."""
|
761 |
+
# Prepare query with context
|
762 |
+
if context:
|
763 |
+
context_str = ' '.join([
|
764 |
+
f"{self.special_tokens['user']} {q} "
|
765 |
+
f"{self.special_tokens['assistant']} {r}"
|
766 |
+
for q, r in context[-self.config.max_context_turns:]
|
767 |
+
])
|
768 |
+
query = f"{context_str} {self.special_tokens['user']} {query}"
|
769 |
+
else:
|
770 |
+
query = f"{self.special_tokens['user']} {query}"
|
771 |
+
|
772 |
+
# Tokenize and pad
|
773 |
+
seq = self.tokenizer.texts_to_sequences([query])
|
774 |
+
padded_seq = tf.keras.preprocessing.sequence.pad_sequences(
|
775 |
+
seq,
|
776 |
+
maxlen=self.config.max_sequence_length,
|
777 |
+
padding='post',
|
778 |
+
truncating='post'
|
779 |
+
)
|
780 |
+
|
781 |
+
return self.query_encoder(padded_seq, training=False)
|
782 |
+
|
783 |
+
def encode_responses(self, responses: List[str]) -> tf.Tensor:
|
784 |
+
"""Encode a batch of responses."""
|
785 |
+
# Prepare responses
|
786 |
+
responses = [
|
787 |
+
f"{self.special_tokens['assistant']} {r}"
|
788 |
+
for r in responses
|
789 |
+
]
|
790 |
+
|
791 |
+
# Tokenize and pad
|
792 |
+
sequences = self.tokenizer.texts_to_sequences(responses)
|
793 |
+
padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(
|
794 |
+
sequences,
|
795 |
+
maxlen=self.config.max_sequence_length,
|
796 |
+
padding='post',
|
797 |
+
truncating='post'
|
798 |
+
)
|
799 |
+
|
800 |
+
return self.response_encoder(padded_sequences, training=False)
|
801 |
+
|
802 |
+
def retrieve_responses(
|
803 |
+
self,
|
804 |
+
query: str,
|
805 |
+
candidates: List[str],
|
806 |
+
context: Optional[List[Tuple[str, str]]] = None,
|
807 |
+
top_k: int = 5
|
808 |
+
) -> List[Tuple[str, float]]:
|
809 |
+
"""Retrieve top-k responses for a query."""
|
810 |
+
# Encode query and candidates
|
811 |
+
q_emb = self.encode_query(query, context)
|
812 |
+
c_emb = self.encode_responses(candidates)
|
813 |
+
|
814 |
+
# Calculate similarities
|
815 |
+
similarities = tf.matmul(q_emb, c_emb, transpose_b=True).numpy()[0]
|
816 |
+
|
817 |
+
# Get top-k responses
|
818 |
+
top_indices = np.argsort(similarities)[::-1][:top_k]
|
819 |
+
|
820 |
+
return [(candidates[i], similarities[i]) for i in top_indices]
|
821 |
+
|
822 |
+
def chat(
|
823 |
+
self,
|
824 |
+
query: str,
|
825 |
+
response_pool: List[str],
|
826 |
+
conversation_history: Optional[List[Tuple[str, str]]] = None,
|
827 |
+
top_k: int = 5
|
828 |
+
) -> Tuple[str, List[Tuple[str, float]]]:
|
829 |
+
"""Interactive chat with response selection."""
|
830 |
+
# Get responses with scores
|
831 |
+
responses = self.retrieve_responses(
|
832 |
+
query,
|
833 |
+
response_pool,
|
834 |
+
conversation_history,
|
835 |
+
top_k
|
836 |
+
)
|
837 |
+
|
838 |
+
# Return best response and all candidates with scores
|
839 |
+
return responses[0][0], responses
|
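
For context, a hedged sketch (not part of the commit) of how the second-iteration RetrievalChatbot in chatbot2.py above might be driven, using the ChatbotConfig, prepare_dataset, train, save_models, and chat methods it defines. The dialogue corpus, checkpoint directory, and save path are placeholders, not values from the repository.

# Hypothetical driver, for illustration only.
from chatbot2 import ChatbotConfig, RetrievalChatbot

config = ChatbotConfig(max_sequence_length=512, embedding_dim=256, learning_rate=1e-3)
chatbot = RetrievalChatbot(config)

# Placeholder: load the augmented dialogue corpus here
# (same {'turns': [{'speaker', 'text'}]} structure as in chatbot.py).
dialogues = [...]

q_pad, p_pad, n_pad = chatbot.prepare_dataset(dialogues, neg_samples_per_pos=3)
chatbot.train(q_pad, p_pad, n_pad, epochs=5, batch_size=32, checkpoint_dir="checkpoints")
chatbot.save_models("chatbot_training/iteration2")

response_pool = [t["text"] for d in dialogues for t in d["turns"] if t["speaker"] == "assistant"]
best, ranked = chatbot.chat("I need a cheap hotel near the station.", response_pool, top_k=5)
print(best)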
chatbot3.py
ADDED
@@ -0,0 +1,824 @@
from transformers import TFAutoModel, AutoTokenizer
import tensorflow as tf
import numpy as np
from typing import List, Tuple, Dict, Optional, Union
from dataclasses import dataclass
import logging
import spacy
import random
import json
from tqdm import tqdm
from pathlib import Path

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

@dataclass
class ChatbotConfig:
    """Enhanced configuration with pretrained model settings."""
    vocab_size: int = 10000
    max_sequence_length: int = 512
    embedding_dim: int = 768  # Match DistilBERT's dimension
    encoder_units: int = 256
    num_attention_heads: int = 8
    dropout_rate: float = 0.2
    l2_reg_weight: float = 0.001
    margin: float = 0.3
    learning_rate: float = 0.001
    min_text_length: int = 3
    max_context_turns: int = 5
    warmup_steps: int = 200
    pretrained_model: str = 'distilbert-base-uncased'
    freeze_embeddings: bool = True
    spacy_model: str = 'en_core_web_md'

    def to_dict(self) -> dict:
        """Convert config to dictionary."""
        return {k: str(v) if isinstance(v, Path) else v
                for k, v in self.__dict__.items()}

    @classmethod
    def from_dict(cls, config_dict: dict) -> 'ChatbotConfig':
        """Create config from dictionary."""
        return cls(**{k: v for k, v in config_dict.items()
                      if k in cls.__dataclass_fields__})

class TransformerBlock(tf.keras.layers.Layer):
    """Custom Transformer block with pre-layer normalization."""
    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        ff_dim: int,
        dropout: float = 0.1,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.dropout = dropout

        self.attention = tf.keras.layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=embed_dim // num_heads,
            dropout=dropout
        )
        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(ff_dim, activation="gelu"),
            tf.keras.layers.Dense(embed_dim),
        ])

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(dropout)
        self.dropout2 = tf.keras.layers.Dropout(dropout)

    def call(self, inputs: tf.Tensor, training: bool, mask: Optional[tf.Tensor] = None) -> tf.Tensor:
        # Pre-layer normalization
        norm_inputs = self.layernorm1(inputs)

        # Self-attention
        attention_output = self.attention(
            query=norm_inputs,
            value=norm_inputs,
            key=norm_inputs,
            attention_mask=mask,
            training=training
        )
        attention_output = self.dropout1(attention_output, training=training)
        attention_output = inputs + attention_output

        # Feed-forward network
        norm_attention = self.layernorm2(attention_output)
        ffn_output = self.ffn(norm_attention)
        ffn_output = self.dropout2(ffn_output, training=training)

        return attention_output + ffn_output

    def get_config(self) -> dict:
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "dropout": self.dropout,
        })
        return config

class EncoderModel(tf.keras.Model):
    """Dual encoder model with pretrained embeddings."""
    def __init__(
        self,
        config: ChatbotConfig,
        name: str = "encoder",
        shared_weights: bool = False,
        **kwargs
    ):
        super().__init__(name=name, **kwargs)
        self.config = config
        self.shared_weights = shared_weights

        # Load pretrained model and tokenizer
        self.pretrained = TFAutoModel.from_pretrained(config.pretrained_model)

        # Freeze pretrained weights if specified
        if config.freeze_embeddings:
            self.pretrained.trainable = False

        # Transformer blocks for additional processing
        self.transformer_blocks = [
            TransformerBlock(
                config.embedding_dim,
                config.num_attention_heads,
                config.encoder_units * 4,
                config.dropout_rate,
                name=f"{name}_transformer_{i}"
            ) for i in range(2)  # Reduced number of blocks since we're using pretrained
        ]

        # Final LSTM layer
        self.final_lstm = tf.keras.layers.LSTM(
            config.encoder_units,
            kernel_regularizer=tf.keras.regularizers.l2(config.l2_reg_weight),
            name=f"{name}_final_lstm"
        )

        self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
        self.normalize = tf.keras.layers.Lambda(
            lambda x: tf.nn.l2_normalize(x, axis=1)
        )

    def call(self, inputs: tf.Tensor, training: bool = False) -> tf.Tensor:
        # Get pretrained embeddings
        pretrained_outputs = self.pretrained(inputs, training=training)
        x = pretrained_outputs.last_hidden_state

        # Get attention mask from input
        attention_mask = tf.cast(tf.not_equal(inputs, 0), tf.float32)
        attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :]

        # Apply transformer blocks
        for transformer_block in self.transformer_blocks:
            x = transformer_block(x, training=training, mask=attention_mask)

        # Final processing
        x = self.final_lstm(x)
        x = self.dropout(x, training=training)
        return self.normalize(x)

class RetrievalChatbot:
    """Modified chatbot using pretrained embeddings with full functionality."""
    def __init__(self, config: ChatbotConfig):
        self.config = config
        self.nlp = spacy.load(config.spacy_model)

        # Use HuggingFace tokenizer instead of Keras
        self.tokenizer = AutoTokenizer.from_pretrained(config.pretrained_model)

        # Special tokens
        self.special_tokens = {
            "user": "<USER>",
            "assistant": "<ASSISTANT>",
            "context": "<CONTEXT>",
            "sep": "<SEP>"
        }

        # Add special tokens to tokenizer
        self.tokenizer.add_special_tokens(
            {'additional_special_tokens': list(self.special_tokens.values())}
        )

        # Build models
        self._build_models()

        # Initialize training tracking
        self.history = {
            "train_loss": [],
            "val_loss": [],
            "train_metrics": {},
            "val_metrics": {}
        }

        self.similarity_cache = {}

    def _build_models(self):
        """Initialize the encoder models."""
        # Query encoder
        self.query_encoder = EncoderModel(
            self.config,
            name="query_encoder",
            shared_weights=False
        )

        # Response encoder (can share weights with query encoder)
        self.response_encoder = EncoderModel(
            self.config,
            name="response_encoder",
            shared_weights=False
        )

        # Resize token embeddings to match the tokenizer's vocab size
        new_vocab_size = len(self.tokenizer)
        self.query_encoder.pretrained.resize_token_embeddings(new_vocab_size)
        self.response_encoder.pretrained.resize_token_embeddings(new_vocab_size)

    def save_models(self, save_dir: Union[str, Path]):
        """Save models and configuration."""
        save_dir = Path(save_dir)
        save_dir.mkdir(parents=True, exist_ok=True)

        # Save config
        with open(save_dir / "config.json", "w") as f:
            json.dump(self.config.to_dict(), f, indent=2)

        # Save models
        self.query_encoder.pretrained.save_pretrained(save_dir / "query_encoder")
        self.response_encoder.pretrained.save_pretrained(save_dir / "response_encoder")

        # Save tokenizer
        self.tokenizer.save_pretrained(save_dir / "tokenizer")

    @classmethod
    def load_models(cls, load_dir: Union[str, Path]) -> 'RetrievalChatbot':
        """Load saved models and configuration."""
        load_dir = Path(load_dir)

        # Load config
        with open(load_dir / "config.json", "r") as f:
            config = ChatbotConfig.from_dict(json.load(f))

        # Initialize chatbot
        chatbot = cls(config)

        # Load models
        chatbot.query_encoder.pretrained = TFAutoModel.from_pretrained(
            load_dir / "query_encoder",
            config=config
        )
        chatbot.response_encoder.pretrained = TFAutoModel.from_pretrained(
            load_dir / "response_encoder",
            config=config
        )

        # Load tokenizer
        chatbot.tokenizer = AutoTokenizer.from_pretrained(load_dir / "tokenizer")

        return chatbot

    def _improved_spacy_similarity(self, text1: str, text2: str) -> float:
        """Calculate semantic similarity between texts with preprocessing."""
        def preprocess(text: str) -> str:
            # Basic cleaning
            text = ' '.join(text.split())
            return text if text.strip() else "empty_document"

        # Get cache key
        cache_key = f"{hash(text1)}_{hash(text2)}"
        if cache_key in self.similarity_cache:
            return self.similarity_cache[cache_key]

        # Process texts
        text1, text2 = preprocess(text1), preprocess(text2)
        doc1, doc2 = self.nlp(text1), self.nlp(text2)

        # Calculate similarity
        if doc1.has_vector and doc2.has_vector:
            sim = doc1.similarity(doc2)
        else:
            # Fallback to token overlap similarity
            tokens1 = {t.lower_ for t in doc1 if not t.is_stop and not t.is_punct}
            tokens2 = {t.lower_ for t in doc2 if not t.is_stop and not t.is_punct}
            intersection = len(tokens1.intersection(tokens2))
            union = len(tokens1.union(tokens2))
            sim = intersection / union if union > 0 else 0.0

        # Cache result
        self.similarity_cache[cache_key] = sim
        return sim

    def prepare_dataset(
        self,
        dialogues: List[dict],
        neg_samples_per_pos: int = 3,
        debug_samples: Optional[int] = None
    ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
        """Prepare dataset with enhanced logging and statistics."""
        logger.info("Preparing dataset...")

        # Apply debug_samples limit if specified
        if debug_samples is not None:
            dialogues = dialogues[:debug_samples]
            logger.info(f"Debug mode: Limited to {debug_samples} dialogues")

        # Log dataset statistics
        total_dialogues = len(dialogues)
        total_turns = sum(len(d['turns']) for d in dialogues)
        avg_turns = total_turns / total_dialogues if total_dialogues > 0 else 0

        logger.info(f"Dataset statistics:")
        logger.info(f"  Total dialogues: {total_dialogues}")
        logger.info(f"  Total turns: {total_turns}")
        logger.info(f"  Average turns per dialogue: {avg_turns:.2f}")

        # Extract and filter responses with logging
        response_pool = []
        skipped_short = 0
        skipped_long = 0

        for d in dialogues:
            for turn in d['turns']:
                if turn.get('speaker') == 'assistant' and 'text' in turn:
                    text = turn['text'].strip()
                    length = len(text.split())
                    if length < self.config.min_text_length:
                        skipped_short += 1
                        continue
                    if length > self.config.max_sequence_length:
                        skipped_long += 1
                        continue
                    response_pool.append(text)

        logger.info(f"Response pool statistics:")
        logger.info(f"  Total responses: {len(response_pool)}")
        logger.info(f"  Skipped (too short): {skipped_short}")
        logger.info(f"  Skipped (too long): {skipped_long}")

        # Process dialogues and create training examples
        queries, positives, negatives = [], [], []

        for dialogue in tqdm(dialogues, desc="Processing dialogues"):
            turns = dialogue.get('turns', [])
            for i in range(len(turns) - 1):
                current_turn = turns[i]
                next_turn = turns[i+1]

                if (current_turn.get('speaker') == 'user' and
                    next_turn.get('speaker') == 'assistant' and
                    'text' in current_turn and
                    'text' in next_turn):

                    query = current_turn['text'].strip()
                    positive = next_turn['text'].strip()

                    # Skip short texts
                    if (len(query.split()) < self.config.min_text_length or
                        len(positive.split()) < self.config.min_text_length):
                        continue

                    # Get negative samples
                    neg_samples = self._smart_negative_sampling(
                        positive,
                        response_pool,
                        neg_samples_per_pos
                    )

                    if len(neg_samples) == neg_samples_per_pos:
                        for neg in neg_samples:
                            queries.append(query)
                            positives.append(positive)
                            negatives.append(neg)
                    else:
                        logger.warning(f"Insufficient negative samples for positive response: '{positive}'")

        # Log final dataset statistics
        logger.info(f"Final dataset statistics:")
        logger.info(f"  Training examples: {len(queries)}")
        logger.info(f"  Unique queries: {len(set(queries))}")
        logger.info(f"  Unique responses: {len(set(positives))}")

        return self._prepare_sequences(queries, positives, negatives)

    def _smart_negative_sampling(
        self,
        positive: str,
        response_pool: List[str],
        n_samples: int,
        max_attempts: int = 200,
        similarity_bounds: Tuple[float, float] = (0.2, 0.9),
        batch_size: int = 10
    ) -> List[str]:
        """Smart negative sampling with similarity bounds and fallback strategies."""
        candidates = []
        seen = set()
        attempts = 0

        while len(candidates) < n_samples and attempts < max_attempts:
            remaining = min(batch_size, len(response_pool) - len(seen), max_attempts - attempts)
            if remaining <= 0:
                break
            batch = random.sample(
                [r for r in response_pool if r not in seen and r != positive],
                remaining
            )

            for candidate in batch:
                seen.add(candidate)
                sim = self._improved_spacy_similarity(candidate, positive)

                if similarity_bounds[0] < sim < similarity_bounds[1]:
                    candidates.append(candidate)
                    if len(candidates) == n_samples:
                        break

            attempts += len(batch)

        if len(candidates) < n_samples:
            logger.warning(f"Only found {len(candidates)} negative samples for positive response: '{positive}'")
            # Fallback to random negatives without similarity constraints
            fallback_needed = n_samples - len(candidates)
            available_negatives = [r for r in response_pool if r != positive and r not in seen]
            if available_negatives:
                additional_negatives = random.sample(
                    available_negatives,
                    min(fallback_needed, len(available_negatives))
                )
                candidates.extend(additional_negatives)

        return candidates

    def _prepare_sequences(
        self,
        queries: List[str],
        positives: List[str],
        negatives: List[str]
    ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
        """Modified sequence preparation for pretrained tokenizer."""
        logger.info("Preparing sequences...")

        # Process texts with special tokens
        queries = [f"{self.special_tokens['user']} {q}" for q in queries]
        positives = [f"{self.special_tokens['assistant']} {p}" for p in positives]
        negatives = [f"{self.special_tokens['assistant']} {n}" for n in negatives]

        # Tokenize using HuggingFace tokenizer
        def encode_batch(texts: List[str]) -> tf.Tensor:
            # HuggingFace tokenizer returns TensorFlow tensors when return_tensors='tf'
            encodings = self.tokenizer(
                texts,
                padding='max_length',
                truncation=True,
                max_length=self.config.max_sequence_length,
                return_tensors='tf'
            )
            return encodings['input_ids']

        # Encode all sequences
        q_tensor = encode_batch(queries)
        p_tensor = encode_batch(positives)
        n_tensor = encode_batch(negatives)

        # Log statistics about encoded sequences
        logger.info("Sequence statistics:")
        logger.info(f"  Query sequence shape: {q_tensor.shape}")
        logger.info(f"  Positive response sequence shape: {p_tensor.shape}")
        logger.info(f"  Negative response sequence shape: {n_tensor.shape}")

        return q_tensor, p_tensor, n_tensor

    def train(
        self,
        q_pad: tf.Tensor,
        p_pad: tf.Tensor,
        n_pad: tf.Tensor,
        epochs: int = 3,
        batch_size: int = 32,
        validation_split: float = 0.2,
        checkpoint_dir: Optional[Union[str, Path]] = None
    ):
        """Train the model with improved training loop."""
        # Setup training
        total_samples = tf.shape(q_pad)[0]
        train_size = int((1 - validation_split) * total_samples.numpy())

        # Shuffle and split data
        indices = tf.random.shuffle(tf.range(start=0, limit=total_samples, dtype=tf.int32))
        train_idx = indices[:train_size]
        val_idx = indices[train_size:]

        # Split data using TF indexing
        train_data = (
            tf.gather(q_pad, train_idx),
            tf.gather(p_pad, train_idx),
            tf.gather(n_pad, train_idx)
        )
        val_data = (
            tf.gather(q_pad, val_idx),
            tf.gather(p_pad, val_idx),
            tf.gather(n_pad, val_idx)
        )

        # Setup optimizer with learning rate schedule
        steps_per_epoch = train_size // batch_size
        total_steps = steps_per_epoch * epochs

        lr_schedule = self._get_lr_schedule(
            total_steps,
            self.config.learning_rate,
            self.config.warmup_steps
        )

        optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

        # Setup checkpointing
        if checkpoint_dir:
            checkpoint_dir = Path(checkpoint_dir)
            checkpoint_dir.mkdir(parents=True, exist_ok=True)

            # Setup checkpoint callback with correct file format
            checkpoint_template = str(checkpoint_dir / "model_epoch_{epoch:04d}.weights.h5")
            checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
                checkpoint_template,
                save_weights_only=True,
                save_best_only=True,
                monitor='val_loss',
                mode='min',
                verbose=1
            )

        # Training loop
        best_val_loss = float('inf')
        patience = 5
        wait = 0

        for epoch in range(epochs):
            # Training
            train_loss = self._train_epoch(
                train_data,
                optimizer,
                batch_size,
                training=True
            )

            # Validation
            val_loss = self._train_epoch(
                val_data,
                optimizer,
                batch_size,
                training=False
            )

            # Update history
            self.history['train_loss'].append(train_loss)
            self.history['val_loss'].append(val_loss)

            logger.info(
                f"Epoch {epoch + 1}/{epochs} - "
                f"train_loss: {train_loss:.4f} - "
                f"val_loss: {val_loss:.4f}"
            )

            # Early stopping
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                wait = 0
                if checkpoint_dir:
                    self.save_models(checkpoint_dir / f"best_model")
            else:
                wait += 1
                if wait >= patience:
                    logger.info("Early stopping triggered")
                    break

    def _train_epoch(
        self,
        data: Tuple[tf.Tensor, tf.Tensor, tf.Tensor],
        optimizer: tf.keras.optimizers.Optimizer,
        batch_size: int,
        training: bool = True
    ) -> float:
        """Train for one epoch with enhanced logging and progress tracking."""
        q_data, p_data, n_data = data
        total_loss = 0.0
        num_batches = tf.shape(q_data)[0] // batch_size

        # Log current learning rate at start of epoch
        if training:
            if hasattr(optimizer.learning_rate, '__call__'):
                current_lr = optimizer.learning_rate(optimizer.iterations)
            else:
                current_lr = optimizer.learning_rate
            logger.info(f"Current learning rate: {float(current_lr):.6f}")

        # Create progress bar
        mode = "Training" if training else "Validation"
        pbar = tqdm(
            total=num_batches.numpy(),
            desc=f"{mode} batches",
            unit="batch",
            dynamic_ncols=True
        )

        # Process batches
        for i in range(num_batches):
            start_idx = i * batch_size
            end_idx = start_idx + batch_size

            batch_loss = self._train_step(
                q_data[start_idx:end_idx],
                p_data[start_idx:end_idx],
                n_data[start_idx:end_idx],
                optimizer,
                training
            )
            total_loss += batch_loss.numpy()

            # Update progress bar with current loss
            avg_loss = total_loss / (i + 1)
            pbar.set_postfix({
                'loss': f'{avg_loss:.4f}',
                'lr': f'{float(current_lr):.6f}' if training else 'N/A'
            })
            pbar.update(1)

        pbar.close()
        return total_loss / num_batches.numpy() if num_batches > 0 else 0.0

    @tf.function
    def _train_step(
        self,
        q_batch: tf.Tensor,
        p_batch: tf.Tensor,
        n_batch: tf.Tensor,
        optimizer: tf.keras.optimizers.Optimizer,
        training: bool = True
    ) -> tf.Tensor:
        """Single training step with triplet loss."""
        with tf.GradientTape() as tape:
            # Get embeddings
            q_emb = self.query_encoder(q_batch, training=training)
            p_emb = self.response_encoder(p_batch, training=training)
            n_emb = self.response_encoder(n_batch, training=training)

            # Calculate triplet loss
            pos_dist = tf.reduce_sum(tf.square(q_emb - p_emb), axis=1)
            neg_dist = tf.reduce_sum(tf.square(q_emb - n_emb), axis=1)

            loss = tf.maximum(0.0, self.config.margin + pos_dist - neg_dist)
            loss = tf.reduce_mean(loss)

        if training:
            # Apply gradients
            gradients = tape.gradient(
                loss,
                self.query_encoder.trainable_variables +
                self.response_encoder.trainable_variables
            )
            optimizer.apply_gradients(zip(
                gradients,
                self.query_encoder.trainable_variables +
                self.response_encoder.trainable_variables
            ))

        return loss

    def _get_lr_schedule(
        self,
        total_steps: int,
        peak_lr: float,
        warmup_steps: int
    ) -> tf.keras.optimizers.schedules.LearningRateSchedule:
        """Enhanced learning rate schedule with better error handling and logging."""
        class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
            def __init__(
                self,
                total_steps: int,
                peak_lr: float,
                warmup_steps: int
            ):
                super().__init__()
                self.total_steps = tf.cast(total_steps, tf.float32)
                self.peak_lr = tf.cast(peak_lr, tf.float32)
                self.warmup_steps = tf.cast(max(1, warmup_steps), tf.float32)  # Prevent 0

                # Calculate and store constants
                self.initial_lr = self.peak_lr * 0.1  # Start at 10% of peak
                self.min_lr = self.peak_lr * 0.01  # Minimum 1% of peak

                logger.info(f"Learning rate schedule initialized:")
                logger.info(f"  Initial LR: {float(self.initial_lr):.6f}")
                logger.info(f"  Peak LR: {float(self.peak_lr):.6f}")
                logger.info(f"  Min LR: {float(self.min_lr):.6f}")
                logger.info(f"  Warmup steps: {int(self.warmup_steps)}")
                logger.info(f"  Total steps: {int(self.total_steps)}")

            def __call__(self, step):
                step = tf.cast(step, tf.float32)

                # Warmup phase
                warmup_factor = tf.minimum(1.0, step / self.warmup_steps)
                warmup_lr = self.initial_lr + (self.peak_lr - self.initial_lr) * warmup_factor

                # Decay phase
                decay_steps = tf.maximum(1.0, self.total_steps - self.warmup_steps)
                decay_factor = (step - self.warmup_steps) / decay_steps
                decay_factor = tf.minimum(tf.maximum(0.0, decay_factor), 1.0)  # Clip to [0,1]

                cosine_decay = 0.5 * (1.0 + tf.cos(np.pi * decay_factor))
                decay_lr = self.min_lr + (self.peak_lr - self.min_lr) * cosine_decay

                # Choose between warmup and decay
                final_lr = tf.where(step < self.warmup_steps, warmup_lr, decay_lr)

                # Ensure learning rate is valid
                final_lr = tf.maximum(self.min_lr, final_lr)
                final_lr = tf.where(tf.math.is_finite(final_lr), final_lr, self.min_lr)

                return final_lr

            def get_config(self):
                return {
                    "total_steps": self.total_steps,
                    "peak_lr": self.peak_lr,
                    "warmup_steps": self.warmup_steps,
                }

        return CustomSchedule(total_steps, peak_lr, warmup_steps)

    def encode_query(self, query: str, context: Optional[List[Tuple[str, str]]] = None) -> tf.Tensor:
        """Encode a query with optional conversation context."""
        # Prepare query with context
        if context:
            context_str = ' '.join([
                f"{self.special_tokens['user']} {q} "
                f"{self.special_tokens['assistant']} {r}"
                for q, r in context[-self.config.max_context_turns:]
            ])
            query = f"{context_str} {self.special_tokens['user']} {query}"
        else:
            query = f"{self.special_tokens['user']} {query}"

        # Tokenize and pad using TensorFlow tensors
        encodings = self.tokenizer(
            [query],
            padding='max_length',
            truncation=True,
            max_length=self.config.max_sequence_length,
            return_tensors='tf'
        )
        input_ids = encodings['input_ids']

        return self.query_encoder(input_ids, training=False)

    def encode_responses(self, responses: List[str]) -> tf.Tensor:
        """Encode a batch of responses."""
        # Prepare responses
        responses = [
            f"{self.special_tokens['assistant']} {r}"
            for r in responses
        ]

        # Tokenize and pad using TensorFlow tensors
        encodings = self.tokenizer(
            responses,
            padding='max_length',
            truncation=True,
            max_length=self.config.max_sequence_length,
            return_tensors='tf'
        )
        input_ids = encodings['input_ids']

        return self.response_encoder(input_ids, training=False)

    def retrieve_responses(
        self,
        query: str,
        candidates: List[str],
        context: Optional[List[Tuple[str, str]]] = None,
        top_k: int = 5
    ) -> List[Tuple[str, float]]:
        """Retrieve top-k responses for a query."""
        # Encode query and candidates
        q_emb = self.encode_query(query, context)
        c_emb = self.encode_responses(candidates)

        # Calculate similarities
        similarities = tf.matmul(q_emb, c_emb, transpose_b=True).numpy()[0]

        # Get top-k responses
        top_indices = np.argsort(similarities)[::-1][:top_k]

        return [(candidates[i], similarities[i]) for i in top_indices]

    def chat(
        self,
        query: str,
        response_pool: List[str],
        conversation_history: Optional[List[Tuple[str, str]]] = None,
        top_k: int = 5
    ) -> Tuple[str, List[Tuple[str, float]]]:
        """Interactive chat with response selection."""
        # Get responses with scores
        responses = self.retrieve_responses(
            query,
            response_pool,
            conversation_history,
            top_k
        )

        # Return best response and all candidates with scores
        return responses[0][0], responses
chatbot4.py
ADDED
@@ -0,0 +1,1291 @@
from transformers import TFAutoModel, AutoTokenizer
import tensorflow as tf
import numpy as np
from typing import List, Tuple, Dict, Optional, Union, Any
from dataclasses import dataclass
import logging
import json
from tqdm import tqdm
from pathlib import Path
import faiss
from response_quality_checker import ResponseQualityChecker

policy = tf.keras.mixed_precision.Policy('mixed_float16')
tf.keras.mixed_precision.set_global_policy(policy)

# Configure logging
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

@dataclass
class ChatbotConfig:
    """Configuration for the RetrievalChatbot."""
    vocab_size: int = 30526  # DistilBERT vocab size
    max_sequence_length: int = 512
    embedding_dim: int = 768  # Match DistilBERT's dimension
    encoder_units: int = 256
    num_attention_heads: int = 8
    dropout_rate: float = 0.2
    l2_reg_weight: float = 0.001
    margin: float = 0.3
    learning_rate: float = 0.001
    min_text_length: int = 3
    max_context_turns: int = 5
    warmup_steps: int = 200
    pretrained_model: str = 'distilbert-base-uncased'
    dtype: str = 'float32'
    freeze_embeddings: bool = False
    # Additional configurations can be added here

    def to_dict(self) -> dict:
        """Convert config to dictionary."""
        return {k: str(v) if isinstance(v, Path) else v
                for k, v in self.__dict__.items()}

    @classmethod
    def from_dict(cls, config_dict: dict) -> 'ChatbotConfig':
        """Create config from dictionary."""
        return cls(**{k: v for k, v in config_dict.items()
                      if k in cls.__dataclass_fields__})

class EncoderModel(tf.keras.Model):
    """Dual encoder model with pretrained embeddings."""
    def __init__(
        self,
        config: ChatbotConfig,
        name: str = "encoder",
        shared_weights: bool = False,
        **kwargs
    ):
        super().__init__(name=name, **kwargs)
        self.config = config
        self.shared_weights = shared_weights

        # Load pretrained model
        self.pretrained = TFAutoModel.from_pretrained(config.pretrained_model)

        # Freeze pretrained weights if specified
        self.pretrained.distilbert.embeddings.trainable = False
        for i, layer_module in enumerate(self.pretrained.distilbert.transformer.layer):
            if i < 3:  # freeze the first three transformer layers (i = 0, 1, 2)
                layer_module.trainable = False
            else:
                layer_module.trainable = True

        # Pooling layer (Global Average Pooling)
        self.pooler = tf.keras.layers.GlobalAveragePooling1D()

        # Dropout and normalization
        self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
        self.normalize = tf.keras.layers.Lambda(
            lambda x: tf.nn.l2_normalize(x, axis=1)
        )

    def call(self, inputs: tf.Tensor, training: bool = False) -> tf.Tensor:
        """Forward pass."""
        # Get pretrained embeddings
        pretrained_outputs = self.pretrained(inputs, training=training)
        x = pretrained_outputs.last_hidden_state  # Shape: [batch_size, seq_len, embedding_dim]

        # Apply pooling
        x = self.pooler(x)  # Shape: [batch_size, embedding_dim]

        # Apply dropout
        x = self.dropout(x, training=training)

        # L2 normalization
        x = self.normalize(x)  # Shape: [batch_size, embedding_dim]

        return x

    def get_config(self) -> dict:
        """Return the config of the model."""
        config = super().get_config()
        config.update({
            "config": self.config.to_dict(),
            "shared_weights": self.shared_weights,
            "name": self.name
        })
        return config

# class CustomLRSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
#     def __init__(self, initial_lr, peak_lr, min_lr, warmup_steps, total_steps):
#         super().__init__()
#         self.initial_lr = initial_lr
#         self.peak_lr = peak_lr
#         self.min_lr = min_lr
#         self.warmup_steps = min(warmup_steps, total_steps // 2)  # Ensure warmup_steps <= total_steps
#         self.total_steps = total_steps

#     def __call__(self, step):
#         if step < self.warmup_steps:
#             # Linear warmup
#             lr = self.initial_lr + (self.peak_lr - self.initial_lr) * (step / self.warmup_steps)
#         else:
#             # Linear decay
#             decay_steps = self.total_steps - self.warmup_steps
#             if decay_steps > 0:
#                 lr = self.peak_lr - (self.peak_lr - self.min_lr) * ((step - self.warmup_steps) / decay_steps)
#             else:
#                 lr = self.peak_lr
#         return lr

#     def get_config(self):
#         return {
#             "initial_lr": self.initial_lr,
#             "peak_lr": self.peak_lr,
#             "min_lr": self.min_lr,
#             "warmup_steps": self.warmup_steps,
#             "total_steps": self.total_steps,
#         }

class RetrievalChatbot:
    """Retrieval-based chatbot using pretrained embeddings and FAISS for similarity search."""
    def __init__(self, config: ChatbotConfig, dialogues: List[dict] = []):
        self.config = config

        # Special tokens
        self.special_tokens = {
            "user": "<USER>",
            "assistant": "<ASSISTANT>",
            "context": "<CONTEXT>",
            "sep": "<SEP>"
        }

        # Initialize tokenizer and add special tokens
        self.tokenizer = AutoTokenizer.from_pretrained(config.pretrained_model)
        self.tokenizer.add_special_tokens(
            {'additional_special_tokens': list(self.special_tokens.values())}
        )

        # Build encoders
        self._build_models()

        # Initialize FAISS index
        self._initialize_faiss()

        # Precompute and index response embeddings
        self._precompute_and_index_responses(dialogues)

        # Initialize training history
        self.history = {
            "train_loss": [],
            "val_loss": [],
            "train_metrics": {},
            "val_metrics": {}
        }

    def _build_models(self):
        """Initialize the shared encoder."""
        logger.info("Building encoder model...")

        # Shared encoder for both queries and responses
        self.encoder = EncoderModel(
            self.config,
            name="shared_encoder",
        )

        # Resize token embeddings after adding special tokens
        new_vocab_size = len(self.tokenizer)
        self.encoder.pretrained.resize_token_embeddings(new_vocab_size)
        logger.info(f"Token embeddings resized to: {new_vocab_size}")

        # Inspect embeddings attributes for debugging
        logger.info("Inspecting embeddings attributes:")
        for attr in dir(self.encoder.pretrained.distilbert.embeddings):
            if not attr.startswith('_'):
                logger.info(f"  {attr}")

        # Verify embedding layers without accessing word_embeddings directly
        embedding_dim = getattr(self.encoder.pretrained.distilbert.embeddings, 'embedding_dim', 'Unknown')
        vocab_size = getattr(self.encoder.pretrained.distilbert.embeddings, 'input_dim', len(self.tokenizer))
        logger.info(f"Encoder Embedding Dimension: {embedding_dim}")
        logger.info(f"Encoder Embedding Vocabulary Size: {vocab_size}")

        logger.info("Encoder model built and embeddings resized successfully.")
        for var in self.encoder.pretrained.trainable_variables:
            logger.info(f"{var.name}, {var.shape}")

    def check_trainable_variables(self):
        """Logs the trainable variables in the shared encoder."""
        logger.info("Checking trainable variables in shared_encoder:")
        for var in self.encoder.pretrained.trainable_variables:
            logger.info(f"  {var.name}, shape: {var.shape}")

        # logger.info("Checking trainable variables in response_encoder:")
        # for var in self.response_encoder.pretrained.trainable_variables:
        #     logger.info(f"  {var.name}, shape: {var.shape}")

    def _initialize_faiss(self):
        """Initialize FAISS index based on available resources."""
        logger.info("Initializing FAISS index...")
        # Determine if GPU FAISS is available
        try:
            res = faiss.StandardGpuResources()
            self.faiss_gpu = True
            logger.info("FAISS GPU resources initialized.")
        except Exception as e:
            self.faiss_gpu = False
            logger.info("FAISS GPU resources not available. Using FAISS CPU.")

        # Initialize FAISS index for Inner Product (for cosine similarity)
        if self.faiss_gpu:
            self.index = faiss.IndexFlatIP(self.config.embedding_dim)
            self.index = faiss.index_cpu_to_gpu(res, 0, self.index)
        else:
            self.index = faiss.IndexFlatIP(self.config.embedding_dim)
        logger.info("FAISS index initialized.")

    def verify_faiss_index(chatbot):
        """Verify that FAISS index matches the response pool."""
        indexed_size = chatbot.index.ntotal
        pool_size = len(chatbot.response_pool)
        logger.info(f"FAISS index size: {indexed_size}")
        logger.info(f"Response pool size: {pool_size}")
        if indexed_size != pool_size:
            logger.warning("Mismatch between FAISS index size and response pool size.")
        else:
            logger.info("FAISS index correctly matches the response pool.")

    def _precompute_and_index_responses(self, dialogues: List[dict]):
        """Precompute embeddings for all responses and index them using FAISS."""
        logger.info("Precomputing response embeddings and indexing with FAISS...")

        # Use tqdm for collecting responses
        responses = []
        for dialogue in tqdm(dialogues, desc="Collecting assistant responses"):
            turns = dialogue.get('turns', [])
            for turn in turns:
                if turn.get('speaker') == 'assistant' and 'text' in turn:
                    responses.append(turn['text'].strip())

        # Remove duplicates
        unique_responses = list(set(responses))
        logger.info(f"Found {len(unique_responses)} unique responses.")

        # Encode responses
        response_embeddings = self.encode_responses(unique_responses)
        response_embeddings = response_embeddings.numpy()

        # Ensure float32
        if response_embeddings.dtype != np.float32:
            logger.info(f"Converting embeddings from {response_embeddings.dtype} to float32.")
            response_embeddings = response_embeddings.astype('float32')

        # Ensure the array is contiguous in memory
        if not response_embeddings.flags['C_CONTIGUOUS']:
            logger.info("Making embeddings contiguous in memory.")
            response_embeddings = np.ascontiguousarray(response_embeddings)

        # Normalize embeddings for cosine similarity
        logger.info("Normalizing embeddings with FAISS.")
        faiss.normalize_L2(response_embeddings)

        # Add to FAISS index
        logger.info("Adding embeddings to FAISS index...")
        self.index.add(response_embeddings)
        logger.info(f"Indexed {self.index.ntotal} responses.")

        # Store responses and embeddings
        self.response_pool = unique_responses
        self.response_embeddings = response_embeddings
        logger.info("Precomputation and indexing completed.")

    def encode_responses(
        self,
        responses: List[str],
        batch_size: int = 64
    ) -> tf.Tensor:
        """
        Encodes a list of responses into embeddings, using chunked/batched processing
        to avoid running out of memory when there are many responses.

        Args:
            responses (List[str]): The list of response texts to encode.
            batch_size (int): How many responses to encode per chunk.
                Adjust based on available GPU/CPU memory.

        Returns:
            tf.Tensor: Tensor of shape (N, emb_dim) with all response embeddings.
        """
        logger.info(f"Encoding {len(responses)} responses in batches of size {batch_size}...")

        # We'll accumulate embeddings in a list and concatenate at the end
        all_embeddings = []

        # Set up a progress bar
        from tqdm import tqdm
        pbar = tqdm(total=len(responses), desc="Encoding responses")

        # Process the responses in chunks of 'batch_size'
        for start_idx in range(0, len(responses), batch_size):
            end_idx = start_idx + batch_size
            batch_texts = responses[start_idx:end_idx]

            # Tokenize the current batch
            encodings = self.tokenizer(
                batch_texts,
                padding='max_length',
                truncation=True,
                max_length=self.config.max_sequence_length,
                return_tensors='tf',
            )

            # Run the encoder forward pass
            input_ids = encodings['input_ids']
            embeddings_batch = self.encoder(input_ids, training=False)

            # Cast to float32 if needed
            if embeddings_batch.dtype != tf.float32:
                embeddings_batch = tf.cast(embeddings_batch, tf.float32)

            # Collect
            all_embeddings.append(embeddings_batch)

            # Update progress bar
            pbar.update(len(batch_texts))

        pbar.close()

        # Concatenate all batch embeddings along axis=0
        if len(all_embeddings) == 1:
            # Only one batch
            final_embeddings = all_embeddings[0]
        else:
            # Multiple batches, concatenate
            final_embeddings = tf.concat(all_embeddings, axis=0)

        logger.info(
            f"Finished encoding {len(responses)} responses. "
            f"Final shape: {final_embeddings.shape}"
        )
        return final_embeddings

    def encode_query(self, query: str, context: Optional[List[Tuple[str, str]]] = None) -> tf.Tensor:
        """Encode a query with optional conversation context."""
        # Prepare query with context
        if context:
            context_str = ' '.join([
                f"{self.special_tokens['user']} {q} "
                f"{self.special_tokens['assistant']} {r}"
                for q, r in context[-self.config.max_context_turns:]
            ])
            query = f"{context_str} {self.special_tokens['user']} {query}"
        else:
            query = f"{self.special_tokens['user']} {query}"

        # Tokenize and encode
        encodings = self.tokenizer(
            [query],
            padding='max_length',
            truncation=True,
            max_length=self.config.max_sequence_length,
            return_tensors='tf'
        )
        input_ids = encodings['input_ids']

        # Verify token IDs
        max_id = tf.reduce_max(input_ids).numpy()
        new_vocab_size = len(self.tokenizer)
        logger.info(f"Maximum input_id: {max_id}, Vocab Size: {new_vocab_size}")

        if max_id >= new_vocab_size:
            logger.error(f"Token ID {max_id} exceeds the vocabulary size {new_vocab_size}.")
            raise ValueError("Token ID exceeds vocabulary size.")

        # Get embeddings from the shared encoder
        return self.encoder(input_ids, training=False)

    def retrieve_responses_faiss(self, query: str, top_k: int = 5) -> List[Tuple[str, float]]:
        """Retrieve top-k responses using FAISS."""
        # Encode the query
        q_emb = self.encode_query(query)  # Shape: [1, embedding_dim]
        q_emb_np = q_emb.numpy().astype('float32')  # Ensure type matches FAISS requirements

        # Normalize the query embedding for cosine similarity
        faiss.normalize_L2(q_emb_np)

        # Search the FAISS index
        distances, indices = self.index.search(q_emb_np, top_k)

        # Map indices to responses and distances to similarities
        top_responses = []
        for i, idx in enumerate(indices[0]):
            if idx < len(self.response_pool):
                top_responses.append((self.response_pool[idx], float(distances[0][i])))
            else:
                logger.warning(f"FAISS returned invalid index {idx}. Skipping.")

        return top_responses

    def save_models(self, save_dir: Union[str, Path]):
        """Save models and configuration."""
        save_dir = Path(save_dir)
        save_dir.mkdir(parents=True, exist_ok=True)

        # Save config
        with open(save_dir / "config.json", "w") as f:
            json.dump(self.config.to_dict(), f, indent=2)

        # Save models
        self.encoder.pretrained.save_pretrained(save_dir / "shared_encoder")

        # Save tokenizer
        self.tokenizer.save_pretrained(save_dir / "tokenizer")

        logger.info(f"Models and tokenizer saved to {save_dir}.")

    @classmethod
    def load_models(cls, load_dir: Union[str, Path]) -> 'RetrievalChatbot':
        """Load saved models and configuration."""
        load_dir = Path(load_dir)

        # Load config
        with open(load_dir / "config.json", "r") as f:
            config = ChatbotConfig.from_dict(json.load(f))

        # Initialize chatbot
        chatbot = cls(config)

        # Load models
        chatbot.encoder.pretrained = TFAutoModel.from_pretrained(
            load_dir / "shared_encoder",
            config=config
        )
        # chatbot.response_encoder.pretrained = TFAutoModel.from_pretrained(
        #     load_dir / "response_encoder",
        #     config=config
        # )

        # Load tokenizer
        chatbot.tokenizer = AutoTokenizer.from_pretrained(load_dir / "tokenizer")

        logger.info(f"Models and tokenizer loaded from {load_dir}.")
        return chatbot

    @staticmethod
    def load_training_data(data_path: Union[str, Path], debug_samples: Optional[int] = None) -> List[dict]:
        """
        Load training data from a JSON file.

        Args:
            data_path (Union[str, Path]): Path to the JSON file containing dialogues.
            debug_samples (Optional[int]): Number of samples to load for debugging.

        Returns:
            List[dict]: List of dialogue dictionaries.
        """
        logger.info(f"Loading training data from {data_path}...")
        data_path = Path(data_path)
        if not data_path.exists():
            logger.error(f"Data file {data_path} does not exist.")
            return []

        with open(data_path, 'r', encoding='utf-8') as f:
            dialogues = json.load(f)

        if debug_samples is not None:
            dialogues = dialogues[:debug_samples]
            logger.info(f"Debug mode: Limited to {debug_samples} dialogues")

        logger.info(f"Loaded {len(dialogues)} dialogues.")
        return dialogues

    def prepare_dataset(
        self,
        dialogues: List[dict],
        debug_samples: int = None
    ) -> Tuple[tf.Tensor, tf.Tensor]:
        """
        Prepares dataset for in-batch negatives:
        only returns (query, positive) pairs.
        """
        logger.info("Preparing in-batch dataset...")

        queries, positives = [], []

        for dialogue in dialogues:
            turns = dialogue.get('turns', [])
            for i in range(len(turns) - 1):
                current_turn = turns[i]
                next_turn = turns[i+1]

                if (current_turn.get('speaker') == 'user' and
                    next_turn.get('speaker') == 'assistant' and
                    'text' in current_turn and
                    'text' in next_turn):

                    query = current_turn['text'].strip()
                    positive = next_turn['text'].strip()

                    queries.append(query)
                    positives.append(positive)

        # Optional debug slicing
        if debug_samples is not None:
            queries = queries[:debug_samples]
            positives = positives[:debug_samples]
            logger.info(f"Debug mode: limited to {debug_samples} pairs.")

        logger.info(f"Prepared {len(queries)} (query, positive) pairs.")

        # Tokenize queries
        encoded_queries = self.tokenizer(
            queries,
            padding='max_length',
            truncation=True,
            max_length=self.config.max_sequence_length,
            return_tensors='tf'
        )
        # Tokenize positives
        encoded_positives = self.tokenizer(
            positives,
            padding='max_length',
            truncation=True,
            max_length=self.config.max_sequence_length,
            return_tensors='tf'
        )

        q_tensor = encoded_queries['input_ids']
        p_tensor = encoded_positives['input_ids']

        logger.info("Tokenized and padded sequences for in-batch training.")
        return q_tensor, p_tensor

    def train(
        self,
        q_pad: tf.Tensor,
        p_pad: tf.Tensor,
        epochs: int,
        batch_size: int,
        validation_split: float,
        checkpoint_dir: str,
        use_lr_schedule: bool = True,
        peak_lr: float = 2e-5,
        warmup_steps_ratio: float = 0.1,
        early_stopping_patience: int = 3,
        min_delta: float = 1e-4
    ):
        dataset_size = tf.shape(q_pad)[0].numpy()
        val_size = int(dataset_size * validation_split)
        train_size = dataset_size - val_size

        logger.info(f"Total samples: {dataset_size}")
        logger.info(f"Training samples: {train_size}")
        logger.info(f"Validation samples: {val_size}")

        steps_per_epoch = train_size // batch_size
        if train_size % batch_size != 0:
            steps_per_epoch += 1
        total_steps = steps_per_epoch * epochs
        logger.info(f"Total training steps (approx): {total_steps}")

        # 1) Set up LR schedule or fixed LR
        if use_lr_schedule:
            warmup_steps = int(total_steps * warmup_steps_ratio)
            lr_schedule = self._get_lr_schedule(
                total_steps=total_steps,
                peak_lr=peak_lr,
                warmup_steps=warmup_steps
            )
            self.optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
            logger.info("Using custom learning rate schedule.")
        else:
            self.optimizer = tf.keras.optimizers.Adam(learning_rate=peak_lr)
            logger.info("Using fixed learning rate.")

        # 2) Prepare data splits
        train_q = q_pad[:train_size]
        train_p = p_pad[:train_size]
        val_q = q_pad[train_size:]
        val_p = p_pad[train_size:]

        train_dataset = tf.data.Dataset.from_tensor_slices((train_q, train_p))
        train_dataset = train_dataset.shuffle(buffer_size=4096).batch(batch_size)

        val_dataset = tf.data.Dataset.from_tensor_slices((val_q, val_p))
        val_dataset = val_dataset.batch(batch_size)

        # 3) Checkpoint + manager
        checkpoint = tf.train.Checkpoint(optimizer=self.optimizer, model=self.encoder)
        manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=3)

        # 4) TensorBoard setup
        import datetime
        import os
        from pathlib import Path

        log_dir = Path(checkpoint_dir) / "tensorboard_logs"
        log_dir.mkdir(parents=True, exist_ok=True)

        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        train_log_dir = str(log_dir / f"train_{current_time}")
        val_log_dir = str(log_dir / f"val_{current_time}")

        train_summary_writer = tf.summary.create_file_writer(train_log_dir)
        val_summary_writer = tf.summary.create_file_writer(val_log_dir)

        logger.info(f"TensorBoard logs will be saved in {log_dir}")

        # 5) Early stopping
        best_val_loss = float("inf")
        epochs_no_improve = 0

        logger.info("Beginning training loop...")
        global_step = 0

        from tqdm import tqdm
        for epoch in range(1, epochs + 1):
            logger.info(f"\n=== Epoch {epoch}/{epochs} ===")
            epoch_loss_avg = tf.keras.metrics.Mean()

            # Training loop
            with tqdm(total=steps_per_epoch, desc=f"Training Epoch {epoch}") as pbar:
                for (q_batch, p_batch) in train_dataset:
                    global_step += 1

                    # Train step
                    batch_loss = self._train_step(q_batch, p_batch)
                    epoch_loss_avg(batch_loss)

                    # Get current LR
                    if use_lr_schedule:
                        lr = self.optimizer.learning_rate
                        if isinstance(lr, tf.keras.optimizers.schedules.LearningRateSchedule):
                            # Get the current step
                            current_step = tf.cast(self.optimizer.iterations, tf.float32)
                            # Compute the current learning rate
                            current_lr = lr(current_step)
                        else:
                            # If learning_rate is not a schedule, use it directly
                            current_lr = lr
                        # Convert to float for logging
                        current_lr_value = float(current_lr.numpy())
                    else:
                        # If using fixed learning rate
                        current_lr_value = float(self.optimizer.learning_rate.numpy())

                    # Update tqdm
                    pbar.update(1)
                    pbar.set_postfix({
                        "loss": f"{batch_loss.numpy():.4f}",
                        "lr": f"{current_lr_value:.2e}"
|
677 |
+
})
|
678 |
+
|
679 |
+
# TensorBoard: log train metrics per step
|
680 |
+
with train_summary_writer.as_default():
|
681 |
+
tf.summary.scalar("loss", batch_loss, step=global_step)
|
682 |
+
tf.summary.scalar("learning_rate", current_lr_value, step=global_step)
|
683 |
+
|
684 |
+
# Validation
|
685 |
+
val_loss_avg = tf.keras.metrics.Mean()
|
686 |
+
for q_val, p_val in val_dataset:
|
687 |
+
q_enc = self.encoder(q_val, training=False)
|
688 |
+
p_enc = self.encoder(p_val, training=False)
|
689 |
+
sim_matrix = tf.matmul(q_enc, p_enc, transpose_b=True)
|
690 |
+
bs_val = tf.shape(q_enc)[0]
|
691 |
+
labels_val = tf.range(bs_val, dtype=tf.int32)
|
692 |
+
loss_val = tf.nn.sparse_softmax_cross_entropy_with_logits(
|
693 |
+
labels=labels_val,
|
694 |
+
logits=sim_matrix
|
695 |
+
)
|
696 |
+
val_loss_avg(tf.reduce_mean(loss_val))
|
697 |
+
|
698 |
+
train_loss = epoch_loss_avg.result().numpy()
|
699 |
+
val_loss = val_loss_avg.result().numpy()
|
700 |
+
|
701 |
+
logger.info(f"Epoch {epoch} Complete: Train Loss={train_loss:.4f}, Val Loss={val_loss:.4f}")
|
702 |
+
|
703 |
+
# TensorBoard: validation loss
|
704 |
+
with val_summary_writer.as_default():
|
705 |
+
tf.summary.scalar("val_loss", val_loss, step=epoch)
|
706 |
+
|
707 |
+
# Save checkpoint
|
708 |
+
manager.save()
|
709 |
+
|
710 |
+
# Update history
|
711 |
+
self.history['train_loss'].append(train_loss)
|
712 |
+
self.history['val_loss'].append(val_loss)
|
713 |
+
self.history.setdefault('learning_rate', []).append(float(current_lr_value))
|
714 |
+
|
715 |
+
# Early stopping
|
716 |
+
if val_loss < best_val_loss - min_delta:
|
717 |
+
best_val_loss = val_loss
|
718 |
+
epochs_no_improve = 0
|
719 |
+
logger.info(f"Validation loss improved to {val_loss:.4f}. Reset patience.")
|
720 |
+
else:
|
721 |
+
epochs_no_improve += 1
|
722 |
+
logger.info(f"No improvement this epoch. Patience: {epochs_no_improve}/{early_stopping_patience}")
|
723 |
+
if epochs_no_improve >= early_stopping_patience:
|
724 |
+
logger.info("Early stopping triggered.")
|
725 |
+
break
|
726 |
+
|
727 |
+
logger.info("In-batch training completed!")
|
728 |
+
|
729 |
+
@tf.function
|
730 |
+
def _train_step(self, q_batch, p_batch):
|
731 |
+
"""
|
732 |
+
Single training step using in-batch negatives.
|
733 |
+
q_batch: (batch_size, seq_len) int32 input_ids for queries
|
734 |
+
p_batch: (batch_size, seq_len) int32 input_ids for positives
|
735 |
+
"""
|
736 |
+
with tf.GradientTape() as tape:
|
737 |
+
# Encode queries and positives
|
738 |
+
q_enc = self.encoder(q_batch, training=True) # [B, emb_dim]
|
739 |
+
p_enc = self.encoder(p_batch, training=True) # [B, emb_dim]
|
740 |
+
|
741 |
+
# Compute similarity matrix: (B, B) = q_enc * p_enc^T
|
742 |
+
# If embeddings are L2-normalized, this is cosine similarity
|
743 |
+
sim_matrix = tf.matmul(q_enc, p_enc, transpose_b=True) # [B, B]
|
744 |
+
|
745 |
+
# Labels are just the diagonal indices
|
746 |
+
batch_size = tf.shape(q_enc)[0]
|
747 |
+
labels = tf.range(batch_size, dtype=tf.int32) # [0..B-1]
|
748 |
+
|
749 |
+
# Softmax cross-entropy
|
750 |
+
loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
|
751 |
+
labels=labels,
|
752 |
+
logits=sim_matrix
|
753 |
+
)
|
754 |
+
loss = tf.reduce_mean(loss)
|
755 |
+
|
756 |
+
# Compute gradients for the pretrained DistilBERT variables only
|
757 |
+
train_vars = self.encoder.pretrained.trainable_variables
|
758 |
+
gradients = tape.gradient(loss, train_vars)
|
759 |
+
|
760 |
+
# Remove any None grads (in case some layers are frozen)
|
761 |
+
grads_and_vars = [(g, v) for g, v in zip(gradients, train_vars) if g is not None]
|
762 |
+
if grads_and_vars:
|
763 |
+
self.optimizer.apply_gradients(grads_and_vars)
|
764 |
+
|
765 |
+
return loss
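# Toy illustration (standalone sketch with random embeddings, not repository code)
# of the in-batch negative objective above: row i of the similarity matrix scores
# query i against every positive in the batch, so the "correct" class for row i is
# column i, which is exactly what labels = tf.range(batch_size) encodes.
import tensorflow as tf
q_toy = tf.math.l2_normalize(tf.random.normal([4, 8]), axis=1)   # 4 query embeddings
p_toy = tf.math.l2_normalize(tf.random.normal([4, 8]), axis=1)   # 4 positive embeddings
sims = tf.matmul(q_toy, p_toy, transpose_b=True)                 # [4, 4] cosine scores
toy_labels = tf.range(4)                                          # diagonal = true pairs
toy_loss = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(labels=toy_labels, logits=sims))
print(float(toy_loss))  # one scalar per batch, as returned by _train_step above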
|
766 |
+
|
767 |
+
def _prepare_sequences(
|
768 |
+
self,
|
769 |
+
queries: List[str],
|
770 |
+
positives: List[str],
|
771 |
+
negatives: List[str]
|
772 |
+
) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
|
773 |
+
"""Prepare and tokenize sequences for training."""
|
774 |
+
logger.info("Preparing sequences for training...")
|
775 |
+
|
776 |
+
# Handle empty lists
|
777 |
+
if not queries:
|
778 |
+
logger.error("No queries to encode. Skipping sequence preparation.")
|
779 |
+
return tf.constant([]), tf.constant([]), tf.constant([])
|
780 |
+
|
781 |
+
# Process texts with special tokens
|
782 |
+
queries = [f"{self.special_tokens['user']} {q}" for q in queries]
|
783 |
+
positives = [f"{self.special_tokens['assistant']} {p}" for p in positives]
|
784 |
+
negatives = [f"{self.special_tokens['assistant']} {n}" for n in negatives]
|
785 |
+
|
786 |
+
# Tokenize using HuggingFace tokenizer
|
787 |
+
def encode_batch(texts: List[str]) -> tf.Tensor:
|
788 |
+
if not texts:
|
789 |
+
logger.error("Empty text list provided to tokenizer.")
|
790 |
+
return tf.constant([])
|
791 |
+
encodings = self.tokenizer(
|
792 |
+
texts,
|
793 |
+
padding='max_length',
|
794 |
+
truncation=True,
|
795 |
+
max_length=self.config.max_sequence_length,
|
796 |
+
return_tensors='tf'
|
797 |
+
)
|
798 |
+
return encodings['input_ids']
|
799 |
+
|
800 |
+
# Encode all sequences
|
801 |
+
q_tensor = encode_batch(queries)
|
802 |
+
p_tensor = encode_batch(positives)
|
803 |
+
n_tensor = encode_batch(negatives)
|
804 |
+
|
805 |
+
# Log statistics about encoded sequences
|
806 |
+
logger.info("Sequence statistics:")
|
807 |
+
logger.info(f" Query sequence shape: {q_tensor.shape}")
|
808 |
+
logger.info(f" Positive response sequence shape: {p_tensor.shape}")
|
809 |
+
logger.info(f" Negative response sequence shape: {n_tensor.shape}")
|
810 |
+
|
811 |
+
return q_tensor, p_tensor, n_tensor
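# Side note (illustrative, standard Hugging Face tokenizer API; the exact token
# strings in self.special_tokens are an assumption here): for speaker prefixes such
# as '<user>' / '<assistant>' to survive tokenization as single tokens rather than
# being split into word pieces, they would typically be registered first, e.g.:
#   tokenizer.add_special_tokens({'additional_special_tokens': ['<user>', '<assistant>']})
#   model.resize_token_embeddings(len(tokenizer))  # only needed if new tokens were added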
|
812 |
+
|
813 |
+
def _get_lr_schedule(
|
814 |
+
self,
|
815 |
+
total_steps: int,
|
816 |
+
peak_lr: float,
|
817 |
+
warmup_steps: int
|
818 |
+
) -> tf.keras.optimizers.schedules.LearningRateSchedule:
|
819 |
+
"""Create a custom learning rate schedule with warmup and cosine decay."""
|
820 |
+
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
|
821 |
+
def __init__(
|
822 |
+
self,
|
823 |
+
total_steps: int,
|
824 |
+
peak_lr: float,
|
825 |
+
warmup_steps: int
|
826 |
+
):
|
827 |
+
super().__init__()
|
828 |
+
self.total_steps = tf.cast(total_steps, tf.float32)
|
829 |
+
self.peak_lr = tf.cast(peak_lr, tf.float32)
|
830 |
+
|
831 |
+
# Cap warmup_steps at 10% of total_steps (and keep at least 1 warmup step)
|
832 |
+
adjusted_warmup_steps = min(warmup_steps, max(1, total_steps // 10))
|
833 |
+
self.warmup_steps = tf.cast(adjusted_warmup_steps, tf.float32)
|
834 |
+
|
835 |
+
# Calculate and store constants
|
836 |
+
self.initial_lr = self.peak_lr * 0.1 # Start at 10% of peak
|
837 |
+
self.min_lr = self.peak_lr * 0.01 # Minimum 1% of peak
|
838 |
+
|
839 |
+
logger.info(f"Learning rate schedule initialized:")
|
840 |
+
logger.info(f" Initial LR: {float(self.initial_lr):.6f}")
|
841 |
+
logger.info(f" Peak LR: {float(self.peak_lr):.6f}")
|
842 |
+
logger.info(f" Min LR: {float(self.min_lr):.6f}")
|
843 |
+
logger.info(f" Warmup steps: {int(self.warmup_steps)}")
|
844 |
+
logger.info(f" Total steps: {int(self.total_steps)}")
|
845 |
+
|
846 |
+
def __call__(self, step):
|
847 |
+
step = tf.cast(step, tf.float32)
|
848 |
+
|
849 |
+
# Warmup phase
|
850 |
+
warmup_factor = tf.minimum(1.0, step / self.warmup_steps)
|
851 |
+
warmup_lr = self.initial_lr + (self.peak_lr - self.initial_lr) * warmup_factor
|
852 |
+
|
853 |
+
# Decay phase
|
854 |
+
decay_steps = tf.maximum(1.0, self.total_steps - self.warmup_steps)
|
855 |
+
decay_factor = (step - self.warmup_steps) / decay_steps
|
856 |
+
decay_factor = tf.minimum(tf.maximum(0.0, decay_factor), 1.0) # Clip to [0,1]
|
857 |
+
|
858 |
+
cosine_decay = 0.5 * (1.0 + tf.cos(np.pi * decay_factor))
|
859 |
+
decay_lr = self.min_lr + (self.peak_lr - self.min_lr) * cosine_decay
|
860 |
+
|
861 |
+
# Choose between warmup and decay
|
862 |
+
final_lr = tf.where(step < self.warmup_steps, warmup_lr, decay_lr)
|
863 |
+
|
864 |
+
# Ensure learning rate is valid
|
865 |
+
final_lr = tf.maximum(self.min_lr, final_lr)
|
866 |
+
final_lr = tf.where(tf.math.is_finite(final_lr), final_lr, self.min_lr)
|
867 |
+
|
868 |
+
return final_lr
|
869 |
+
|
870 |
+
def get_config(self):
|
871 |
+
return {
|
872 |
+
"total_steps": self.total_steps,
|
873 |
+
"peak_lr": self.peak_lr,
|
874 |
+
"warmup_steps": self.warmup_steps,
|
875 |
+
}
|
876 |
+
|
877 |
+
return CustomSchedule(total_steps, peak_lr, warmup_steps)
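# Sanity-check sketch (illustrative; assumes an instance `chatbot` of this class
# and arbitrary hyperparameters) to confirm the warmup -> peak -> cosine-decay shape:
#   schedule = chatbot._get_lr_schedule(total_steps=1000, peak_lr=2e-5, warmup_steps=100)
#   for step in (0, 50, 100, 500, 999):
#       print(step, float(schedule(step)))
# Expected pattern: ~2e-6 at step 0 (10% of peak), rising to ~2e-5 by the end of
# warmup, then decaying smoothly toward ~2e-7 (1% of peak) near the final step.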
|
878 |
+
|
879 |
+
def _cosine_similarity(self, emb1: np.ndarray, emb2: np.ndarray) -> np.ndarray:
|
880 |
+
"""Compute cosine similarity between two numpy arrays."""
|
881 |
+
normalized_emb1 = emb1 / np.linalg.norm(emb1, axis=1, keepdims=True)
|
882 |
+
normalized_emb2 = emb2 / np.linalg.norm(emb2, axis=1, keepdims=True)
|
883 |
+
return np.dot(normalized_emb1, normalized_emb2.T)
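# Example (illustrative): with inputs of shape (n, d) and (m, d), the result is an
# (n, m) matrix of pairwise cosine similarities.
#   a = np.array([[1.0, 0.0], [0.0, 1.0]])
#   b = np.array([[1.0, 0.0]])
#   self._cosine_similarity(a, b)  ->  [[1.0], [0.0]]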
|
884 |
+
|
885 |
+
def run_automatic_validation(
|
886 |
+
self,
|
887 |
+
quality_checker: 'ResponseQualityChecker',
|
888 |
+
num_examples: int = 5
|
889 |
+
) -> Dict[str, Any]:
|
890 |
+
"""
|
891 |
+
Run automatic validation with quality metrics using FAISS-based retrieval.
|
892 |
+
"""
|
893 |
+
logger.info("\n=== Running Automatic Validation ===")
|
894 |
+
|
895 |
+
test_queries = [
|
896 |
+
"Hello, how are you today?",
|
897 |
+
"What's the weather like?",
|
898 |
+
"Can you help me with a problem?",
|
899 |
+
"Tell me a joke",
|
900 |
+
"What time is it?",
|
901 |
+
"I need help with my homework",
|
902 |
+
"Where's a good place to eat?",
|
903 |
+
"What movies are playing?",
|
904 |
+
"How do I reset my password?",
|
905 |
+
"Can you recommend a book?"
|
906 |
+
]
|
907 |
+
|
908 |
+
test_queries = test_queries[:num_examples]
|
909 |
+
metrics_history = []
|
910 |
+
|
911 |
+
for i, query in enumerate(test_queries, 1):
|
912 |
+
logger.info(f"\nTest Case {i}:")
|
913 |
+
logger.info(f"Query: {query}")
|
914 |
+
|
915 |
+
# Get responses and scores using FAISS
|
916 |
+
responses = self.retrieve_responses_faiss(query, top_k=5)
|
917 |
+
|
918 |
+
# Check quality
|
919 |
+
quality_metrics = quality_checker.check_response_quality(query, responses)
|
920 |
+
metrics_history.append(quality_metrics)
|
921 |
+
|
922 |
+
# Log results
|
923 |
+
logger.info(f"Quality Metrics: {quality_metrics}")
|
924 |
+
logger.info("Top responses:")
|
925 |
+
for j, (response, score) in enumerate(responses[:3], 1):
|
926 |
+
logger.info(f"{j}. Score: {score:.4f}")
|
927 |
+
logger.info(f" Response: {response}")
|
928 |
+
if j == 1 and not quality_metrics.get('is_confident', False):
|
929 |
+
logger.info(" [Low Confidence - Would abstain from answering]")
|
930 |
+
|
931 |
+
# Calculate aggregate metrics
|
932 |
+
aggregate_metrics = {
|
933 |
+
'num_queries_tested': len(test_queries),
|
934 |
+
'avg_top_response_score': np.mean([m.get('top_score', 0) for m in metrics_history]),
|
935 |
+
'avg_diversity': np.mean([m.get('response_diversity', 0) for m in metrics_history]),
|
936 |
+
'avg_relevance': np.mean([m.get('query_response_relevance', 0) for m in metrics_history]),
|
937 |
+
'avg_length_score': np.mean([m.get('response_length_score', 0) for m in metrics_history]),
|
938 |
+
'avg_score_gap': np.mean([m.get('top_3_score_gap', 0) for m in metrics_history]),
|
939 |
+
'confidence_rate': np.mean([m.get('is_confident', False) for m in metrics_history]),
|
940 |
+
}
|
941 |
+
|
942 |
+
logger.info("\n=== Validation Summary ===")
|
943 |
+
for metric, value in aggregate_metrics.items():
|
944 |
+
logger.info(f"{metric}: {value:.4f}")
|
945 |
+
|
946 |
+
return aggregate_metrics
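# Usage sketch (assumes `chatbot` and a ResponseQualityChecker instance named
# `checker`; both names are illustrative):
#   summary = chatbot.run_automatic_validation(checker, num_examples=5)
#   print(summary['confidence_rate'], summary['avg_top_response_score'])
# The aggregate dict returned above is what a training script would log after each
# run or compare across checkpoints.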
|
947 |
+
|
948 |
+
def chat(
|
949 |
+
self,
|
950 |
+
query: str,
|
951 |
+
conversation_history: Optional[List[Tuple[str, str]]] = None,
|
952 |
+
quality_checker: Optional['ResponseQualityChecker'] = None,
|
953 |
+
top_k: int = 5
|
954 |
+
) -> Tuple[str, List[Tuple[str, float]], Dict[str, Any]]:
|
955 |
+
"""
|
956 |
+
Interactive chat function with quality checking using FAISS-based retrieval.
|
957 |
+
|
958 |
+
Args:
|
959 |
+
query (str): The user's input query.
|
960 |
+
conversation_history (Optional[List[Tuple[str, str]]]): List of past (user, assistant) exchanges.
|
961 |
+
quality_checker (Optional['ResponseQualityChecker']): Quality checker instance.
|
962 |
+
top_k (int): Number of top responses to retrieve.
|
963 |
+
|
964 |
+
Returns:
|
965 |
+
Tuple[str, List[Tuple[str, float]], Dict[str, Any]]: (Response, Candidates, Quality Metrics)
|
966 |
+
"""
|
967 |
+
# Retrieve responses using FAISS
|
968 |
+
responses = self.retrieve_responses_faiss(query, top_k)
|
969 |
+
|
970 |
+
# If no quality checker provided, return the top response
|
971 |
+
if quality_checker is None:
|
972 |
+
return responses[0][0] if responses else "I'm sorry, I don't have an answer for that.", responses, {}
|
973 |
+
|
974 |
+
# Check quality
|
975 |
+
quality_metrics = quality_checker.check_response_quality(query, responses)
|
976 |
+
|
977 |
+
if quality_metrics.get('is_confident', False):
|
978 |
+
return responses[0][0], responses, quality_metrics
|
979 |
+
else:
|
980 |
+
uncertainty_response = (
|
981 |
+
"I apologize, but I don't feel confident providing an answer to that "
|
982 |
+
"question at the moment. Could you please rephrase or ask something else?"
|
983 |
+
)
|
984 |
+
return uncertainty_response, responses, quality_metrics
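# Minimal REPL sketch (illustrative; assumes `chatbot` and `checker` instances exist):
#   history = []
#   while True:
#       user_input = input("You: ")
#       if user_input.lower() in {"quit", "exit"}:
#           break
#       reply, candidates, metrics = chatbot.chat(
#           user_input, conversation_history=history, quality_checker=checker, top_k=5)
#       history.append((user_input, reply))
#       print(f"Bot: {reply}")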
|
985 |
+
|
986 |
+
# TODO: consider removal
|
987 |
+
# def prepare_dataset(self, dialogues: List[dict], neg_samples_per_pos: int = 1, debug_samples: int = None) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
|
988 |
+
# """Prepares the dataset for training."""
|
989 |
+
# logger.info("Preparing dataset...")
|
990 |
+
|
991 |
+
# # Extract (query, positive, negative) triples
|
992 |
+
# queries, positives, negatives = [], [], []
|
993 |
+
|
994 |
+
# for dialogue in dialogues:
|
995 |
+
# turns = dialogue.get('turns', [])
|
996 |
+
# for i in range(len(turns) - 1):
|
997 |
+
# current_turn = turns[i]
|
998 |
+
# next_turn = turns[i+1]
|
999 |
+
|
1000 |
+
# if (current_turn.get('speaker') == 'user' and
|
1001 |
+
# next_turn.get('speaker') == 'assistant' and
|
1002 |
+
# 'text' in current_turn and
|
1003 |
+
# 'text' in next_turn):
|
1004 |
+
|
1005 |
+
# query = current_turn['text'].strip()
|
1006 |
+
# positive = next_turn['text'].strip()
|
1007 |
+
|
1008 |
+
# # Generate hard negative samples
|
1009 |
+
# hard_negatives = self.hard_negative_sampling(positive, n_samples=neg_samples_per_pos)
|
1010 |
+
# for negative in hard_negatives:
|
1011 |
+
# negatives.append(negative)
|
1012 |
+
# queries.append(query)
|
1013 |
+
# positives.append(positive)
|
1014 |
+
|
1015 |
+
# logger.info(f"Prepared {len(queries)} training examples.")
|
1016 |
+
|
1017 |
+
# # Tokenize and pad sequences
|
1018 |
+
# encoded_queries = self.tokenizer(
|
1019 |
+
# queries,
|
1020 |
+
# padding='max_length',
|
1021 |
+
# truncation=True,
|
1022 |
+
# max_length=self.config.max_sequence_length,
|
1023 |
+
# return_tensors='tf'
|
1024 |
+
# )
|
1025 |
+
# encoded_positives = self.tokenizer(
|
1026 |
+
# positives,
|
1027 |
+
# padding='max_length',
|
1028 |
+
# truncation=True,
|
1029 |
+
# max_length=self.config.max_sequence_length,
|
1030 |
+
# return_tensors='tf'
|
1031 |
+
# )
|
1032 |
+
# encoded_negatives = self.tokenizer(
|
1033 |
+
# negatives,
|
1034 |
+
# padding='max_length',
|
1035 |
+
# truncation=True,
|
1036 |
+
# max_length=self.config.max_sequence_length,
|
1037 |
+
# return_tensors='tf'
|
1038 |
+
# )
|
1039 |
+
|
1040 |
+
# q_tensor = encoded_queries['input_ids']
|
1041 |
+
# p_tensor = encoded_positives['input_ids']
|
1042 |
+
# n_tensor = encoded_negatives['input_ids']
|
1043 |
+
|
1044 |
+
# logger.info(f"Tokenized and padded sequences.")
|
1045 |
+
|
1046 |
+
# return q_tensor, p_tensor, n_tensor
|
1047 |
+
|
1048 |
+
|
1049 |
+
# # TODO: consider removal
|
1050 |
+
# def hard_negative_sampling(self, positive_response, n_samples=1):
|
1051 |
+
# """Select hard negatives based on cosine similarity."""
|
1052 |
+
# try:
|
1053 |
+
# # Ensure we don't request more negatives than available
|
1054 |
+
# max_neg_samples = len(self.response_pool) - 1 # Exclude the positive response
|
1055 |
+
# n_samples = min(n_samples, max_neg_samples)
|
1056 |
+
|
1057 |
+
# if n_samples <= 0:
|
1058 |
+
# logger.error("Not enough responses to sample negatives.")
|
1059 |
+
# return []
|
1060 |
+
|
1061 |
+
# # Encode the positive response using the chatbot's encode_responses method
|
1062 |
+
# pos_emb = self.encode_responses([positive_response]).numpy()
|
1063 |
+
# faiss.normalize_L2(pos_emb)
|
1064 |
+
# #logger.info(f"Normalized positive embedding for response: {positive_response}")
|
1065 |
+
|
1066 |
+
# # Search for the top n_samples + 1 most similar responses (including the positive itself)
|
1067 |
+
# D, I = self.index.search(pos_emb, n_samples + 1)
|
1068 |
+
# #logger.info(f"FAISS search results: {I}")
|
1069 |
+
|
1070 |
+
# # Exclude the positive response itself (assuming it's indexed)
|
1071 |
+
# negatives = []
|
1072 |
+
# for i in range(n_samples):
|
1073 |
+
# idx = I[0][i + 1] # Skip the first one as it's the positive
|
1074 |
+
# if idx < len(self.response_pool):
|
1075 |
+
# negative_response = self.response_pool[idx]
|
1076 |
+
# negatives.append(negative_response)
|
1077 |
+
# logger.info(f"Selected negative: {negative_response}")
|
1078 |
+
# else:
|
1079 |
+
# logger.warning(f"Index {idx} out of range for response_pool with size {len(self.response_pool)}.")
|
1080 |
+
|
1081 |
+
# return negatives
|
1082 |
+
# except Exception as e:
|
1083 |
+
# logger.error(f"An error occurred during hard negative sampling: {e}")
|
1084 |
+
# return []
|
1085 |
+
|
1086 |
+
# def train(
|
1087 |
+
# self,
|
1088 |
+
# q_pad: tf.Tensor,
|
1089 |
+
# p_pad: tf.Tensor,
|
1090 |
+
# n_pad: tf.Tensor,
|
1091 |
+
# epochs: int,
|
1092 |
+
# batch_size: int,
|
1093 |
+
# validation_split: float,
|
1094 |
+
# checkpoint_dir: str,
|
1095 |
+
# callbacks: Optional[List[tf.keras.callbacks.Callback]] = None
|
1096 |
+
# ):
|
1097 |
+
# """
|
1098 |
+
# Train the chatbot model.
|
1099 |
+
|
1100 |
+
# Args:
|
1101 |
+
# q_pad (tf.Tensor): Padded query input_ids.
|
1102 |
+
# p_pad (tf.Tensor): Padded positive response input_ids.
|
1103 |
+
# n_pad (tf.Tensor): Padded negative response input_ids.
|
1104 |
+
# epochs (int): Number of training epochs.
|
1105 |
+
# batch_size (int): Training batch size.
|
1106 |
+
# validation_split (float): Fraction of data to use for validation.
|
1107 |
+
# checkpoint_dir (str): Directory to save model checkpoints.
|
1108 |
+
# callbacks (list, optional): List of Keras callbacks.
|
1109 |
+
# """
|
1110 |
+
# dataset_size = tf.shape(q_pad)[0].numpy()
|
1111 |
+
# val_size = int(dataset_size * validation_split)
|
1112 |
+
# train_size = dataset_size - val_size
|
1113 |
+
|
1114 |
+
# logger.info(f"Total samples: {dataset_size}")
|
1115 |
+
# logger.info(f"Training samples: {train_size}")
|
1116 |
+
# logger.info(f"Validation samples: {val_size}")
|
1117 |
+
|
1118 |
+
# # Calculate steps_per_epoch
|
1119 |
+
# steps_per_epoch = train_size // batch_size
|
1120 |
+
# if train_size % batch_size != 0:
|
1121 |
+
# steps_per_epoch += 1
|
1122 |
+
# total_steps = steps_per_epoch * epochs
|
1123 |
+
|
1124 |
+
# logger.info(f"Total training steps: {total_steps}")
|
1125 |
+
|
1126 |
+
# # Initialize learning rate schedule with adjusted warmup_steps
|
1127 |
+
# lr_schedule = self._get_lr_schedule(
|
1128 |
+
# total_steps=total_steps,
|
1129 |
+
# peak_lr=self.config.learning_rate,
|
1130 |
+
# warmup_steps=self.config.warmup_steps
|
1131 |
+
# )
|
1132 |
+
|
1133 |
+
# # callbacks = []
|
1134 |
+
# # if checkpoint_dir:
|
1135 |
+
# # checkpoint_dir = Path(checkpoint_dir)
|
1136 |
+
# # checkpoint_dir.mkdir(parents=True, exist_ok=True)
|
1137 |
+
|
1138 |
+
# # # Setup checkpoint callback with correct file format
|
1139 |
+
# # checkpoint_template = str(checkpoint_dir / "model_epoch_{epoch:04d}.weights.h5")
|
1140 |
+
# # checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
|
1141 |
+
# # checkpoint_template,
|
1142 |
+
# # save_weights_only=True,
|
1143 |
+
# # save_best_only=True,
|
1144 |
+
# # monitor='val_loss',
|
1145 |
+
# # mode='min',
|
1146 |
+
# # verbose=1
|
1147 |
+
# # )
|
1148 |
+
# # callbacks.append(checkpoint_callback)
|
1149 |
+
|
1150 |
+
# # # Early stopping callback
|
1151 |
+
# # early_stopping = tf.keras.callbacks.EarlyStopping(
|
1152 |
+
# # monitor='val_loss',
|
1153 |
+
# # patience=5,
|
1154 |
+
# # restore_best_weights=True,
|
1155 |
+
# # verbose=1
|
1156 |
+
# # )
|
1157 |
+
# # callbacks.append(early_stopping)
|
1158 |
+
|
1159 |
+
# # # TensorBoard callback
|
1160 |
+
# # tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir='./logs')
|
1161 |
+
# # callbacks.append(tensorboard_callback)
|
1162 |
+
|
1163 |
+
# # Update optimizer with the new learning rate schedule
|
1164 |
+
# self.optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
|
1165 |
+
|
1166 |
+
# # Split the data
|
1167 |
+
# train_q = q_pad[:train_size]
|
1168 |
+
# train_p = p_pad[:train_size]
|
1169 |
+
# train_n = n_pad[:train_size]
|
1170 |
+
|
1171 |
+
# val_q = q_pad[train_size:]
|
1172 |
+
# val_p = p_pad[train_size:]
|
1173 |
+
# val_n = n_pad[train_size:]
|
1174 |
+
|
1175 |
+
# # Create TensorFlow datasets
|
1176 |
+
# train_dataset = tf.data.Dataset.from_tensor_slices((train_q, train_p, train_n))
|
1177 |
+
# train_dataset = train_dataset.shuffle(buffer_size=1000).batch(batch_size)
|
1178 |
+
|
1179 |
+
# val_dataset = tf.data.Dataset.from_tensor_slices((val_q, val_p, val_n))
|
1180 |
+
# val_dataset = val_dataset.batch(batch_size)
|
1181 |
+
|
1182 |
+
# # Log dataset sizes
|
1183 |
+
# logger.info(f"Training dataset batches: {len(list(train_dataset))}")
|
1184 |
+
# logger.info(f"Validation dataset batches: {len(list(val_dataset))}")
|
1185 |
+
|
1186 |
+
# # Create checkpoint manager
|
1187 |
+
# checkpoint = tf.train.Checkpoint(optimizer=self.optimizer, model=self.encoder)
|
1188 |
+
# manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=3)
|
1189 |
+
|
1190 |
+
# for epoch in range(1, epochs + 1):
|
1191 |
+
# logger.info(f"Epoch {epoch}/{epochs}")
|
1192 |
+
# epoch_loss_avg = tf.keras.metrics.Mean()
|
1193 |
+
|
1194 |
+
# # Training loop
|
1195 |
+
# for q_batch, p_batch, n_batch in train_dataset:
|
1196 |
+
# batch_loss = self._train_step(q_batch, p_batch, n_batch)
|
1197 |
+
# epoch_loss_avg(batch_loss)
|
1198 |
+
|
1199 |
+
# # Validation loop
|
1200 |
+
# val_loss_avg = tf.keras.metrics.Mean()
|
1201 |
+
# try:
|
1202 |
+
# for q_val, p_val, n_val in val_dataset:
|
1203 |
+
# # Encode queries, positives, and negatives without training
|
1204 |
+
# q_enc = self.encoder(q_val, training=False)
|
1205 |
+
# p_enc = self.encoder(p_val, training=False)
|
1206 |
+
# n_enc = self.encoder(n_val, training=False)
|
1207 |
+
|
1208 |
+
# # Compute cosine similarities
|
1209 |
+
# pos_sim = tf.reduce_sum(tf.multiply(q_enc, p_enc), axis=1)
|
1210 |
+
# neg_sim = tf.reduce_sum(tf.multiply(q_enc, n_enc), axis=1)
|
1211 |
+
|
1212 |
+
# # Ensure similarities are float32
|
1213 |
+
# pos_sim = tf.cast(pos_sim, tf.float32)
|
1214 |
+
# neg_sim = tf.cast(neg_sim, tf.float32)
|
1215 |
+
|
1216 |
+
# # Compute loss with margin
|
1217 |
+
# margin = tf.cast(self.config.margin, tf.float32)
|
1218 |
+
# loss = tf.maximum(0.0, margin - pos_sim + neg_sim)
|
1219 |
+
|
1220 |
+
# val_loss_avg(tf.reduce_mean(loss))
|
1221 |
+
|
1222 |
+
# # Optional: Log individual batch validation loss
|
1223 |
+
# logger.debug(f"Batch Validation Loss: {tf.reduce_mean(loss).numpy():.6f}")
|
1224 |
+
|
1225 |
+
# train_loss = epoch_loss_avg.result().numpy()
|
1226 |
+
# val_loss = val_loss_avg.result().numpy()
|
1227 |
+
# logger.info(f"Epoch {epoch}: Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")
|
1228 |
+
|
1229 |
+
# # Save checkpoint
|
1230 |
+
# manager.save()
|
1231 |
+
|
1232 |
+
# # Update history
|
1233 |
+
# self.history['train_loss'].append(train_loss)
|
1234 |
+
# self.history['val_loss'].append(val_loss)
|
1235 |
+
|
1236 |
+
# # Invoke callbacks if any
|
1237 |
+
# if callbacks:
|
1238 |
+
# for callback in callbacks:
|
1239 |
+
# callback.on_epoch_end(epoch, logs={'loss': train_loss, 'val_loss': val_loss})
|
1240 |
+
|
1241 |
+
# except tf.errors.OutOfRangeError:
|
1242 |
+
# logger.warning("Validation dataset is exhausted before expected.")
|
1243 |
+
# self.history['val_loss'].append(val_loss_avg.result().numpy())
|
1244 |
+
|
1245 |
+
# logger.info("Training completed.")
|
1246 |
+
|
1247 |
+
# @tf.function
|
1248 |
+
# def _train_step(self, q_batch, p_batch, n_batch):
|
1249 |
+
# """
|
1250 |
+
# Performs a single training step with query, positive, and negative batches.
|
1251 |
+
|
1252 |
+
# Args:
|
1253 |
+
# q_batch (tf.Tensor): Batch of query input_ids.
|
1254 |
+
# p_batch (tf.Tensor): Batch of positive response input_ids.
|
1255 |
+
# n_batch (tf.Tensor): Batch of negative response input_ids.
|
1256 |
+
|
1257 |
+
# Returns:
|
1258 |
+
# tf.Tensor: Mean loss for the batch.
|
1259 |
+
# """
|
1260 |
+
# with tf.GradientTape() as tape:
|
1261 |
+
# # Encode queries, positives, and negatives using the shared encoder
|
1262 |
+
# q_enc = self.encoder(q_batch, training=True) # Shape: (batch_size, embedding_dim)
|
1263 |
+
# p_enc = self.encoder(p_batch, training=True) # Shape: (batch_size, embedding_dim)
|
1264 |
+
# n_enc = self.encoder(n_batch, training=True) # Shape: (batch_size, embedding_dim)
|
1265 |
+
|
1266 |
+
# # Compute cosine similarities
|
1267 |
+
# pos_sim = tf.reduce_sum(tf.multiply(q_enc, p_enc), axis=1) # Shape: (batch_size,)
|
1268 |
+
# neg_sim = tf.reduce_sum(tf.multiply(q_enc, n_enc), axis=1) # Shape: (batch_size,)
|
1269 |
+
|
1270 |
+
# # Ensure similarities are float32
|
1271 |
+
# pos_sim = tf.cast(pos_sim, tf.float32)
|
1272 |
+
# neg_sim = tf.cast(neg_sim, tf.float32)
|
1273 |
+
|
1274 |
+
# # Compute loss with margin
|
1275 |
+
# margin = tf.cast(self.config.margin, tf.float32)
|
1276 |
+
# loss = tf.maximum(0.0, margin - pos_sim + neg_sim)
|
1277 |
+
|
1278 |
+
# # Compute gradients and update encoder weights
|
1279 |
+
# gradients = tape.gradient(loss, self.encoder.pretrained.trainable_variables)
|
1280 |
+
|
1281 |
+
# # Filter out None gradients (if any)
|
1282 |
+
# grads_and_vars = [
|
1283 |
+
# (g, v) for g, v in zip(gradients, self.encoder.pretrained.trainable_variables)
|
1284 |
+
# if g is not None
|
1285 |
+
# ]
|
1286 |
+
|
1287 |
+
# if grads_and_vars:
|
1288 |
+
# self.optimizer.apply_gradients(grads_and_vars)
|
1289 |
+
|
1290 |
+
# # Return mean loss
|
1291 |
+
# return tf.reduce_mean(loss)
|
dialogue_augmenter.py
CHANGED
@@ -3,11 +3,9 @@ import numpy as np
|
|
3 |
import torch
|
4 |
import tensorflow as tf
|
5 |
import tensorflow_hub as hub
|
6 |
-
import re
|
7 |
from pipeline_config import PipelineConfig
|
8 |
from quality_metrics import QualityMetrics
|
9 |
from paraphraser import Paraphraser
|
10 |
-
from back_translator import BackTranslator
|
11 |
import nlpaug.augmenter.word as naw
|
12 |
from concurrent.futures import ThreadPoolExecutor
|
13 |
from functools import lru_cache
|
@@ -29,9 +27,12 @@ class DialogueAugmenter:
|
|
29 |
print(f"Using device: {self.device}")
|
30 |
if self.use_gpu:
|
31 |
print(f"GPU Device: {torch.cuda.get_device_name(0)}")
|
|
|
32 |
|
33 |
-
# Load base models
|
34 |
self.quality_metrics = QualityMetrics(config)
|
|
|
|
|
|
|
35 |
self.use_model = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
|
36 |
|
37 |
# Initialize augmentation models based on hardware
|
@@ -39,10 +40,6 @@ class DialogueAugmenter:
|
|
39 |
|
40 |
# Initialize caches
|
41 |
self.embedding_cache = {}
|
42 |
-
self.perplexity_cache = {}
|
43 |
-
|
44 |
-
# Compile regex patterns
|
45 |
-
self.spelling_pattern = re.compile(r'[a-zA-Z]{3,}')
|
46 |
|
47 |
# GPU memory management if available
|
48 |
if self.use_gpu:
|
@@ -57,25 +54,20 @@ class DialogueAugmenter:
|
|
57 |
def _initialize_augmentation_models(self):
|
58 |
"""Initialize augmentation models with appropriate device settings"""
|
59 |
# Advanced augmentation techniques
|
60 |
-
self.paraphraser = Paraphraser()
|
61 |
-
self.back_translator = BackTranslator()
|
62 |
-
|
63 |
if self.use_gpu:
|
64 |
-
# Move
|
65 |
self.paraphraser.model = self.paraphraser.model.to(self.device)
|
66 |
-
self.back_translator.model_pivot_forward = self.back_translator.model_pivot_forward.to(self.device)
|
67 |
-
self.back_translator.model_pivot_backward = self.back_translator.model_pivot_backward.to(self.device)
|
68 |
-
self.back_translator.model_backward = self.back_translator.model_backward.to(self.device)
|
69 |
|
70 |
# Basic augmentation techniques
|
71 |
self.word_augmenter = naw.SynonymAug(aug_src='wordnet')
|
72 |
-
self.spelling_augmenter = naw.SpellingAug()
|
73 |
|
74 |
self.augmenters = {
|
75 |
-
'advanced': [
|
|
|
|
|
76 |
'basic': [
|
77 |
('synonym', self.word_augmenter),
|
78 |
-
('spelling', self.spelling_augmenter)
|
79 |
]
|
80 |
}
|
81 |
|
@@ -103,52 +95,46 @@ class DialogueAugmenter:
|
|
103 |
|
104 |
def _quick_quality_check(self, variation: str, original: str) -> bool:
|
105 |
"""
|
106 |
-
|
107 |
"""
|
108 |
if self.config.debug:
|
109 |
print(f"\nQuick check for variation: {variation}")
|
110 |
|
111 |
-
# Stricter length check
|
112 |
orig_len = len(original.split())
|
113 |
var_len = len(variation.split())
|
114 |
-
|
115 |
-
# For very short texts (
|
116 |
if orig_len <= 3:
|
117 |
-
if var_len > orig_len * 3:
|
118 |
if self.config.debug:
|
119 |
print(f"Failed length check (short text): {var_len} vs {orig_len}")
|
120 |
return False
|
121 |
else:
|
122 |
-
if var_len > orig_len * 2:
|
123 |
if self.config.debug:
|
124 |
print(f"Failed length check (long text): {var_len} vs {orig_len}")
|
125 |
return False
|
126 |
-
|
127 |
-
#
|
128 |
stop_words = {'a', 'an', 'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'is', 'are', 'that', 'this', 'will', 'can'}
|
129 |
orig_words = set(w.lower() for w in original.split() if w.lower() not in stop_words)
|
130 |
var_words = set(w.lower() for w in variation.split() if w.lower() not in stop_words)
|
131 |
-
|
132 |
-
#
|
133 |
-
|
134 |
-
|
|
|
|
|
|
|
|
|
|
|
135 |
if self.config.debug:
|
136 |
-
print(
|
137 |
-
|
138 |
-
|
139 |
if self.config.debug:
|
140 |
print("Passed all quick checks")
|
141 |
return True
|
142 |
|
143 |
-
def _compute_metrics_parallel(self, original: str, candidates: List[str]) -> List[Dict[str, float]]:
|
144 |
-
"""Compute quality metrics for multiple candidates in parallel"""
|
145 |
-
with ThreadPoolExecutor(max_workers=4) as executor:
|
146 |
-
futures = [
|
147 |
-
executor.submit(self.quality_metrics.compute_metrics, original, candidate)
|
148 |
-
for candidate in candidates
|
149 |
-
]
|
150 |
-
return [future.result() for future in futures]
|
151 |
-
|
152 |
def _filter_variations_batch(self, variations: List[str], context: List[str], original_turn: str) -> List[str]:
|
153 |
"""
|
154 |
Filter variations using batched computations with detailed logging
|
@@ -162,12 +148,17 @@ class DialogueAugmenter:
|
|
162 |
print(f"Original turn: {original_turn}")
|
163 |
|
164 |
words = original_turn.split()
|
|
|
|
|
|
|
|
|
|
|
165 |
if len(words) < 3:
|
166 |
if self.config.debug:
|
167 |
print("Short text detected, using predefined variations")
|
168 |
short_text_variations = self._augment_short_text({'text': original_turn, 'speaker': ''})
|
169 |
return [var['text'] for var in short_text_variations]
|
170 |
-
|
171 |
# If this is the first turn (no context), be more lenient
|
172 |
if not context:
|
173 |
preliminary_filtered = variations
|
@@ -183,57 +174,85 @@ class DialogueAugmenter:
|
|
183 |
print(f"Passed quick check: {passed}")
|
184 |
if passed:
|
185 |
preliminary_filtered.append(var)
|
186 |
-
|
187 |
if self.config.debug:
|
188 |
print(f"Variations after quick check: {len(preliminary_filtered)}")
|
189 |
-
|
190 |
if not preliminary_filtered:
|
191 |
return []
|
192 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
193 |
# Only use last turn for coherence
|
194 |
recent_context = [context[-1]] if context else []
|
195 |
context_text = ' '.join(recent_context) if recent_context else ''
|
196 |
-
|
197 |
-
# Even more lenient thresholds
|
198 |
-
min_similarity = 0.1 # Further reduced
|
199 |
-
min_coherence = 0.05 # Further reduced
|
200 |
-
|
201 |
if context_text:
|
202 |
if self.config.debug:
|
203 |
print(f"\nContext text: {context_text}")
|
204 |
-
|
205 |
-
all_texts = [context_text] +
|
206 |
all_embeddings = self._compute_batch_embeddings(all_texts)
|
207 |
-
|
208 |
context_embedding = all_embeddings[0]
|
209 |
variation_embeddings = all_embeddings[1:]
|
210 |
-
|
211 |
# Vectorized similarity computation
|
212 |
context_similarities = cosine_similarity([context_embedding], variation_embeddings)[0]
|
213 |
-
|
214 |
# Response coherence check
|
215 |
if recent_context:
|
216 |
prev_embedding = self._compute_embedding(recent_context[-1])
|
217 |
response_coherence = cosine_similarity([prev_embedding], variation_embeddings)[0]
|
218 |
else:
|
219 |
response_coherence = np.ones_like(context_similarities)
|
220 |
-
|
221 |
-
# Combined scoring with detailed logging
|
222 |
filtered_variations = []
|
223 |
for i, (variation, sim, coh) in enumerate(zip(
|
224 |
-
|
225 |
-
# Use absolute values for scoring
|
226 |
combined_score = (
|
227 |
self.config.context_similarity_weight * abs(sim) +
|
228 |
self.config.response_coherence_weight * abs(coh)
|
229 |
)
|
230 |
-
|
231 |
if self.config.debug:
|
232 |
print(f"\nVariation: {variation}")
|
233 |
print(f"Context similarity: {sim:.3f}")
|
234 |
print(f"Response coherence: {coh:.3f}")
|
235 |
print(f"Combined score: {combined_score:.3f}")
|
236 |
-
|
237 |
# Accept if EITHER score is good enough
|
238 |
if (combined_score >= min_similarity or abs(coh) >= min_coherence):
|
239 |
filtered_variations.append(variation)
|
@@ -242,74 +261,71 @@ class DialogueAugmenter:
|
|
242 |
else:
|
243 |
if self.config.debug:
|
244 |
print("REJECTED")
|
245 |
-
|
246 |
# If we have enough variations, stop
|
247 |
if len(filtered_variations) >= self.config.max_variations_per_turn:
|
248 |
break
|
249 |
else:
|
250 |
-
filtered_variations =
|
251 |
-
|
252 |
if self.config.debug:
|
253 |
print(f"\nFinal filtered variations: {len(filtered_variations)}")
|
254 |
-
|
255 |
return filtered_variations
|
256 |
|
257 |
def _generate_variations_progressive(self, text: str, needed: int) -> List[str]:
|
258 |
"""
|
259 |
-
Generate variations progressively until we have enough good ones
|
|
|
260 |
"""
|
261 |
variations = set()
|
262 |
-
|
263 |
if self.config.debug:
|
264 |
print(f"\nAttempting to generate {needed} variations for text: {text}")
|
265 |
-
|
266 |
-
#
|
267 |
for augmenter in self.augmenters['advanced']:
|
268 |
if len(variations) >= needed:
|
269 |
break
|
270 |
-
|
271 |
try:
|
272 |
if isinstance(augmenter, Paraphraser):
|
273 |
if self.config.debug:
|
274 |
print("Trying paraphrase augmentation...")
|
275 |
-
new_vars = augmenter.paraphrase(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
276 |
if self.config.debug:
|
277 |
print(f"Paraphraser generated {len(new_vars)} variations")
|
278 |
-
|
279 |
-
if self.config.debug:
|
280 |
-
print("Trying back translation...")
|
281 |
-
new_vars = [augmenter.back_translate(text)]
|
282 |
-
if self.config.debug:
|
283 |
-
print(f"Back translator generated {len(new_vars)} variations")
|
284 |
-
|
285 |
valid_vars = [v for v in new_vars if v.strip() and v != text]
|
286 |
variations.update(valid_vars)
|
287 |
-
|
288 |
if self.config.debug:
|
289 |
print(f"Current unique variations: {len(variations)}")
|
290 |
-
|
291 |
except Exception as e:
|
292 |
print(f"Error in advanced augmentation: {str(e)}")
|
293 |
continue
|
294 |
-
|
295 |
# Try basic augmenters if needed
|
296 |
if len(variations) < needed:
|
297 |
if self.config.debug:
|
298 |
print("Not enough variations, trying basic augmenters...")
|
299 |
-
|
300 |
for aug_type, augmenter in self.augmenters['basic']:
|
301 |
if len(variations) >= needed:
|
302 |
break
|
303 |
-
|
304 |
try:
|
305 |
-
if aug_type == 'spelling' and self._is_technical_or_formal_text(text):
|
306 |
-
if self.config.debug:
|
307 |
-
print("Skipping spelling augmentation for technical text")
|
308 |
-
continue
|
309 |
-
|
310 |
if self.config.debug:
|
311 |
print(f"Trying {aug_type} augmentation...")
|
312 |
-
|
313 |
new_vars = augmenter.augment(text, n=2)
|
314 |
if isinstance(new_vars, list):
|
315 |
valid_vars = [v for v in new_vars if v.strip() and v != text]
|
@@ -317,21 +333,21 @@ class DialogueAugmenter:
|
|
317 |
else:
|
318 |
if new_vars.strip() and new_vars != text:
|
319 |
variations.add(new_vars)
|
320 |
-
|
321 |
if self.config.debug:
|
322 |
print(f"After {aug_type}, total variations: {len(variations)}")
|
323 |
-
|
324 |
except Exception as e:
|
325 |
print(f"Error in {aug_type} augmentation: {str(e)}")
|
326 |
continue
|
327 |
-
|
328 |
variations_list = list(variations)
|
329 |
-
|
330 |
if self.config.debug:
|
331 |
print(f"Final number of variations generated: {len(variations_list)}")
|
332 |
if not variations_list:
|
333 |
print("WARNING: No variations were generated!")
|
334 |
-
|
335 |
return variations_list
|
336 |
|
337 |
def augment_dialogue(self, dialogue: Dict) -> List[Dict]:
|
@@ -375,7 +391,8 @@ class DialogueAugmenter:
|
|
375 |
# Generate combinations with sampling
|
376 |
augmented_dialogues = self._generate_dialogue_combinations(
|
377 |
dialogue['dialogue_id'],
|
378 |
-
turn_variations
|
|
|
379 |
)
|
380 |
|
381 |
# Add original dialogue
|
@@ -392,47 +409,201 @@ class DialogueAugmenter:
|
|
392 |
|
393 |
return result
|
394 |
|
395 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
396 |
"""
|
397 |
-
|
|
|
|
|
398 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
399 |
augmented_dialogues = []
|
400 |
used_combinations = set()
|
401 |
-
|
402 |
-
def
|
403 |
if current_turns is None:
|
404 |
current_turns = []
|
405 |
|
406 |
-
if len(augmented_dialogues) >=
|
407 |
return
|
408 |
|
409 |
if turn_index == len(turn_variations):
|
|
|
410 |
dialogue_fingerprint = " | ".join(turn['text'] for turn in current_turns)
|
411 |
if dialogue_fingerprint not in used_combinations:
|
412 |
used_combinations.add(dialogue_fingerprint)
|
413 |
-
|
414 |
-
|
415 |
-
|
416 |
-
|
|
|
|
|
|
|
|
|
|
|
417 |
return
|
418 |
-
|
419 |
-
|
420 |
-
|
421 |
-
|
422 |
-
|
423 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
424 |
return
|
425 |
current_turns.append(variation)
|
426 |
-
|
427 |
current_turns.pop()
|
428 |
-
|
429 |
try:
|
430 |
-
|
431 |
except Exception as e:
|
432 |
print(f"Error in dialogue generation: {str(e)}")
|
433 |
return []
|
434 |
-
|
435 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
436 |
|
437 |
def _is_dialogue_duplicate(self, dialogue1: Dict, dialogue2: Dict) -> bool:
|
438 |
"""
|
@@ -445,11 +616,9 @@ class DialogueAugmenter:
|
|
445 |
def _augment_short_text(self, turn: Dict) -> List[Dict]:
|
446 |
"""
|
447 |
Special handling for very short texts with predefined variations.
|
448 |
-
|
449 |
-
|
450 |
-
|
451 |
-
Returns:
|
452 |
-
List[Dict]: List of variations for the short text
|
453 |
"""
|
454 |
text = turn['text']
|
455 |
common_variations = {
|
@@ -483,71 +652,60 @@ class DialogueAugmenter:
|
|
483 |
'Fantastic!', 'Amazing!', 'Terrific!'
|
484 |
]
|
485 |
}
|
486 |
-
|
487 |
-
# Try to find matching variations
|
488 |
text_lower = text.lower().rstrip('!.,?')
|
|
|
489 |
variations = []
|
490 |
-
|
491 |
-
# Check if text matches any of our predefined categories
|
492 |
for key, predefined_vars in common_variations.items():
|
493 |
if key in text_lower or text_lower in key:
|
494 |
variations.extend(predefined_vars)
|
495 |
|
496 |
-
# If no predefined variations found, generate simple variants
|
497 |
if not variations:
|
498 |
-
#
|
|
|
499 |
variations = [
|
500 |
-
|
501 |
-
|
502 |
-
|
503 |
]
|
504 |
|
505 |
# Add capitalization variations
|
506 |
-
variations.
|
507 |
-
|
508 |
-
if v.capitalize() not in variations
|
509 |
-
])
|
510 |
|
511 |
-
#
|
512 |
unique_variations = list(set(variations))
|
513 |
-
quality_variations = []
|
514 |
-
|
515 |
-
for var in unique_variations:
|
516 |
-
metrics = self.quality_metrics.compute_metrics(text, var)
|
517 |
-
quality_score = (
|
518 |
-
0.35 * metrics['semantic_similarity'] +
|
519 |
-
0.30 * (1.0 - metrics['perplexity'] / 100) +
|
520 |
-
0.15 * (1.0 - metrics['grammar_errors'] / 10) +
|
521 |
-
0.15 * metrics['content_preservation'] +
|
522 |
-
0.10 * metrics['type_token_ratio']
|
523 |
-
)
|
524 |
-
|
525 |
-
# More lenient quality threshold for short texts
|
526 |
-
if quality_score >= 0.5: # Lower threshold for short texts
|
527 |
-
quality_variations.append(var)
|
528 |
-
|
529 |
-
# Ensure we have at least some variations
|
530 |
-
if not quality_variations:
|
531 |
-
quality_variations = [text]
|
532 |
|
533 |
-
#
|
534 |
-
|
|
|
|
|
535 |
|
536 |
-
def
|
537 |
-
"""
|
538 |
-
|
539 |
-
|
540 |
-
|
541 |
-
|
542 |
-
|
543 |
-
|
544 |
-
|
545 |
-
|
546 |
-
|
547 |
-
|
|
|
|
|
|
|
|
|
|
|
548 |
|
549 |
-
|
550 |
-
|
551 |
-
|
|
|
|
|
|
|
|
|
|
|
552 |
|
553 |
-
return
|
|
|
3 |
import torch
|
4 |
import tensorflow as tf
|
5 |
import tensorflow_hub as hub
|
|
|
6 |
from pipeline_config import PipelineConfig
|
7 |
from quality_metrics import QualityMetrics
|
8 |
from paraphraser import Paraphraser
|
|
|
9 |
import nlpaug.augmenter.word as naw
|
10 |
from concurrent.futures import ThreadPoolExecutor
|
11 |
from functools import lru_cache
|
|
|
27 |
print(f"Using device: {self.device}")
|
28 |
if self.use_gpu:
|
29 |
print(f"GPU Device: {torch.cuda.get_device_name(0)}")
|
30 |
+
|
31 |
|
|
|
32 |
self.quality_metrics = QualityMetrics(config)
|
33 |
+
self.semantic_similarity_threshold = 0.75
|
34 |
+
|
35 |
+
# Load model
|
36 |
self.use_model = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
|
37 |
|
38 |
# Initialize augmentation models based on hardware
|
|
|
40 |
|
41 |
# Initialize caches
|
42 |
self.embedding_cache = {}
|
|
|
|
|
|
|
|
|
43 |
|
44 |
# GPU memory management if available
|
45 |
if self.use_gpu:
|
|
|
54 |
def _initialize_augmentation_models(self):
|
55 |
"""Initialize augmentation models with appropriate device settings"""
|
56 |
# Advanced augmentation techniques
|
57 |
+
self.paraphraser = Paraphraser()
|
|
|
|
|
58 |
if self.use_gpu:
|
59 |
+
# Move model to GPU if available
|
60 |
self.paraphraser.model = self.paraphraser.model.to(self.device)
|
|
|
|
|
|
|
61 |
|
62 |
# Basic augmentation techniques
|
63 |
self.word_augmenter = naw.SynonymAug(aug_src='wordnet')
|
|
|
64 |
|
65 |
self.augmenters = {
|
66 |
+
'advanced': [
|
67 |
+
self.paraphraser,
|
68 |
+
],
|
69 |
'basic': [
|
70 |
('synonym', self.word_augmenter),
|
|
|
71 |
]
|
72 |
}
|
73 |
|
|
|
95 |
|
96 |
def _quick_quality_check(self, variation: str, original: str) -> bool:
|
97 |
"""
|
98 |
+
Preliminary quality check while maintaining reasonable pass rates
|
99 |
"""
|
100 |
if self.config.debug:
|
101 |
print(f"\nQuick check for variation: {variation}")
|
102 |
|
|
|
103 |
orig_len = len(original.split())
|
104 |
var_len = len(variation.split())
|
105 |
+
|
106 |
+
# For very short texts (<= 3 words), still allow more variation
|
107 |
if orig_len <= 3:
|
108 |
+
if var_len > orig_len * 3:
|
109 |
if self.config.debug:
|
110 |
print(f"Failed length check (short text): {var_len} vs {orig_len}")
|
111 |
return False
|
112 |
else:
|
113 |
+
if var_len > orig_len * 2:
|
114 |
if self.config.debug:
|
115 |
print(f"Failed length check (long text): {var_len} vs {orig_len}")
|
116 |
return False
|
117 |
+
|
118 |
+
# Adjust content overlap check based on length
|
119 |
stop_words = {'a', 'an', 'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'is', 'are', 'that', 'this', 'will', 'can'}
|
120 |
orig_words = set(w.lower() for w in original.split() if w.lower() not in stop_words)
|
121 |
var_words = set(w.lower() for w in variation.split() if w.lower() not in stop_words)
|
122 |
+
|
123 |
+
# If very short turn (less than 5 words), skip the content overlap check
|
124 |
+
if orig_len >= 5:
|
125 |
+
content_overlap = len(orig_words.intersection(var_words)) / len(orig_words) if orig_words else 0
|
126 |
+
if content_overlap < 0.2:
|
127 |
+
if self.config.debug:
|
128 |
+
print(f"Failed content check: overlap {content_overlap:.2f}")
|
129 |
+
return False
|
130 |
+
else:
|
131 |
if self.config.debug:
|
132 |
+
print("Short turn detected (<5 words), skipping content overlap check")
|
133 |
+
|
|
|
134 |
if self.config.debug:
|
135 |
print("Passed all quick checks")
|
136 |
return True
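# Worked example (assumed sentences, not from this repository) of the content-overlap
# rule above: overlap = |orig_words intersection var_words| / |orig_words|, computed
# over non-stop-words, with 0.2 as the cutoff for turns of 5+ words.
stop_words = {'a', 'an', 'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to',
              'for', 'is', 'are', 'that', 'this', 'will', 'can'}
original = "Can you recommend a good Italian restaurant nearby"
variation = "Could you suggest a good Italian place close by"
orig_words = {w.lower() for w in original.split() if w.lower() not in stop_words}
var_words = {w.lower() for w in variation.split() if w.lower() not in stop_words}
overlap = len(orig_words & var_words) / len(orig_words)
print(round(overlap, 2))  # "you", "good", "italian" survive in both -> 3/6 = 0.5, above 0.2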
|
137 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
138 |
def _filter_variations_batch(self, variations: List[str], context: List[str], original_turn: str) -> List[str]:
|
139 |
"""
|
140 |
Filter variations using batched computations with detailed logging
|
|
|
148 |
print(f"Original turn: {original_turn}")
|
149 |
|
150 |
words = original_turn.split()
|
151 |
+
orig_len = len(words)
|
152 |
+
|
153 |
+
# If very short text, consider adjusting thresholds
|
154 |
+
is_very_short = orig_len < 5
|
155 |
+
|
156 |
if len(words) < 3:
|
157 |
if self.config.debug:
|
158 |
print("Short text detected, using predefined variations")
|
159 |
short_text_variations = self._augment_short_text({'text': original_turn, 'speaker': ''})
|
160 |
return [var['text'] for var in short_text_variations]
|
161 |
+
|
162 |
# If this is the first turn (no context), be more lenient
|
163 |
if not context:
|
164 |
preliminary_filtered = variations
|
|
|
174 |
print(f"Passed quick check: {passed}")
|
175 |
if passed:
|
176 |
preliminary_filtered.append(var)
|
177 |
+
|
178 |
if self.config.debug:
|
179 |
print(f"Variations after quick check: {len(preliminary_filtered)}")
|
180 |
+
|
181 |
if not preliminary_filtered:
|
182 |
return []
|
183 |
+
|
184 |
+
# Compute embeddings for original and variations
|
185 |
+
original_embedding = self._compute_embedding(original_turn)
|
186 |
+
variation_embeddings = self._compute_batch_embeddings(preliminary_filtered)
|
187 |
+
|
188 |
+
# Compute similarities
|
189 |
+
sims = cosine_similarity([original_embedding], variation_embeddings)[0]
|
190 |
+
|
191 |
+
# If very short turn, slightly lower the semantic similarity threshold
|
192 |
+
dynamic_sem_threshold = self.semantic_similarity_threshold
|
193 |
+
if is_very_short:
|
194 |
+
dynamic_sem_threshold = max(0.7, self.semantic_similarity_threshold - 0.05)
|
195 |
+
|
196 |
+
# Filter by semantic similarity threshold
|
197 |
+
refined_filtered = []
|
198 |
+
for var, sim in zip(preliminary_filtered, sims):
|
199 |
+
if sim >= dynamic_sem_threshold:
|
200 |
+
refined_filtered.append(var)
|
201 |
+
else:
|
202 |
+
if self.config.debug:
|
203 |
+
print(f"Variation '{var}' discarded due to low semantic similarity: {sim:.3f}")
|
204 |
+
|
205 |
+
if not refined_filtered:
|
206 |
+
return []
|
207 |
+
|
208 |
+
# Relax context coherence thresholds further if desired
|
209 |
+
# We already have min_similarity = 0.1, min_coherence = 0.05
|
210 |
+
# Let's lower them slightly more if the turn is very short:
|
211 |
+
if is_very_short:
|
212 |
+
min_similarity = 0.05
|
213 |
+
min_coherence = 0.02
|
214 |
+
else:
|
215 |
+
min_similarity = 0.1
|
216 |
+
min_coherence = 0.05
|
217 |
+
|
218 |
# Only use last turn for coherence
|
219 |
recent_context = [context[-1]] if context else []
|
220 |
context_text = ' '.join(recent_context) if recent_context else ''
|
221 |
+
|
|
|
|
|
|
|
|
|
222 |
if context_text:
|
223 |
if self.config.debug:
|
224 |
print(f"\nContext text: {context_text}")
|
225 |
+
|
226 |
+
all_texts = [context_text] + refined_filtered
|
227 |
all_embeddings = self._compute_batch_embeddings(all_texts)
|
228 |
+
|
229 |
context_embedding = all_embeddings[0]
|
230 |
variation_embeddings = all_embeddings[1:]
|
231 |
+
|
232 |
# Vectorized similarity computation
|
233 |
context_similarities = cosine_similarity([context_embedding], variation_embeddings)[0]
|
234 |
+
|
235 |
# Response coherence check
|
236 |
if recent_context:
|
237 |
prev_embedding = self._compute_embedding(recent_context[-1])
|
238 |
response_coherence = cosine_similarity([prev_embedding], variation_embeddings)[0]
|
239 |
else:
|
240 |
response_coherence = np.ones_like(context_similarities)
|
241 |
+
|
|
|
242 |
filtered_variations = []
|
243 |
for i, (variation, sim, coh) in enumerate(zip(
|
244 |
+
refined_filtered, context_similarities, response_coherence)):
|
|
|
245 |
combined_score = (
|
246 |
self.config.context_similarity_weight * abs(sim) +
|
247 |
self.config.response_coherence_weight * abs(coh)
|
248 |
)
|
249 |
+
|
250 |
if self.config.debug:
|
251 |
print(f"\nVariation: {variation}")
|
252 |
print(f"Context similarity: {sim:.3f}")
|
253 |
print(f"Response coherence: {coh:.3f}")
|
254 |
print(f"Combined score: {combined_score:.3f}")
|
255 |
+
|
256 |
# Accept if EITHER score is good enough
|
257 |
if (combined_score >= min_similarity or abs(coh) >= min_coherence):
|
258 |
filtered_variations.append(variation)
|
|
|
261 |
else:
|
262 |
if self.config.debug:
|
263 |
print("REJECTED")
|
264 |
+
|
265 |
# If we have enough variations, stop
|
266 |
if len(filtered_variations) >= self.config.max_variations_per_turn:
|
267 |
break
|
268 |
else:
|
269 |
+
filtered_variations = refined_filtered[:self.config.max_variations_per_turn]
|
270 |
+
|
271 |
if self.config.debug:
|
272 |
print(f"\nFinal filtered variations: {len(filtered_variations)}")
|
273 |
+
|
274 |
return filtered_variations
|
275 |
|
276 |
def _generate_variations_progressive(self, text: str, needed: int) -> List[str]:
|
277 |
"""
|
278 |
+
Generate variations progressively until we have enough good ones.
|
279 |
+
Adjust paraphraser parameters for closer paraphrases as needed.
|
280 |
"""
|
281 |
variations = set()
|
282 |
+
|
283 |
if self.config.debug:
|
284 |
print(f"\nAttempting to generate {needed} variations for text: {text}")
|
285 |
+
|
286 |
+
# Paraphraser parameters below already favor close paraphrases (fewer beams, no diversity penalty); adjust here if needed
|
287 |
for augmenter in self.augmenters['advanced']:
|
288 |
if len(variations) >= needed:
|
289 |
break
|
290 |
+
|
291 |
try:
|
292 |
if isinstance(augmenter, Paraphraser):
|
293 |
if self.config.debug:
|
294 |
print("Trying paraphrase augmentation...")
|
295 |
+
new_vars = augmenter.paraphrase(
|
296 |
+
text,
|
297 |
+
num_return_sequences=needed-len(variations),
|
298 |
+
device=self.device if self.use_gpu else None,
|
299 |
+
num_beams=4, # even fewer beams for more faithful paraphrases
|
300 |
+
num_beam_groups=1,
|
301 |
+
diversity_penalty=0.0
|
302 |
+
)
|
303 |
if self.config.debug:
|
304 |
print(f"Paraphraser generated {len(new_vars)} variations")
|
305 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
306 |
valid_vars = [v for v in new_vars if v.strip() and v != text]
|
307 |
variations.update(valid_vars)
|
308 |
+
|
309 |
if self.config.debug:
|
310 |
print(f"Current unique variations: {len(variations)}")
|
311 |
+
|
312 |
except Exception as e:
|
313 |
print(f"Error in advanced augmentation: {str(e)}")
|
314 |
continue
|
315 |
+
|
316 |
# Try basic augmenters if needed
|
317 |
if len(variations) < needed:
|
318 |
if self.config.debug:
|
319 |
print("Not enough variations, trying basic augmenters...")
|
320 |
+
|
321 |
for aug_type, augmenter in self.augmenters['basic']:
|
322 |
if len(variations) >= needed:
|
323 |
break
|
324 |
+
|
325 |
try:
|
|
|
|
|
|
|
|
|
|
|
326 |
if self.config.debug:
|
327 |
print(f"Trying {aug_type} augmentation...")
|
328 |
+
|
329 |
new_vars = augmenter.augment(text, n=2)
|
330 |
if isinstance(new_vars, list):
|
331 |
valid_vars = [v for v in new_vars if v.strip() and v != text]
|
|
|
333 |
else:
|
334 |
if new_vars.strip() and new_vars != text:
|
335 |
variations.add(new_vars)
|
336 |
+
|
337 |
if self.config.debug:
|
338 |
print(f"After {aug_type}, total variations: {len(variations)}")
|
339 |
+
|
340 |
except Exception as e:
|
341 |
print(f"Error in {aug_type} augmentation: {str(e)}")
|
342 |
continue
|
343 |
+
|
344 |
variations_list = list(variations)
|
345 |
+
|
346 |
if self.config.debug:
|
347 |
print(f"Final number of variations generated: {len(variations_list)}")
|
348 |
if not variations_list:
|
349 |
print("WARNING: No variations were generated!")
|
350 |
+
|
351 |
return variations_list
|
352 |
|
353 |
def augment_dialogue(self, dialogue: Dict) -> List[Dict]:
|
|
|
391 |
# Generate combinations with sampling
|
392 |
augmented_dialogues = self._generate_dialogue_combinations(
|
393 |
dialogue['dialogue_id'],
|
394 |
+
turn_variations,
|
395 |
+
dialogue
|
396 |
)
|
397 |
|
398 |
# Add original dialogue
|
|
|
409 |
|
410 |
return result
|
411 |
|
412 |
+
def _variation_score(self, original: str, variation: str) -> float:
|
413 |
+
"""
|
414 |
+
Compute a single numeric score for a variation to guide selection.
|
415 |
+
You could use semantic similarity, content preservation, etc.
|
416 |
+
Higher is better.
|
417 |
+
"""
|
418 |
+
metrics = self.quality_metrics.compute_metrics(original, variation)
|
419 |
+
# Example: Primarily semantic similarity, with a slight boost for content preservation
|
420 |
+
# Adjust as needed.
|
421 |
+
score = metrics['semantic_similarity'] * 0.7 + metrics['content_preservation'] * 0.3
|
422 |
+
return score
|
423 |
+
|
424 |
+
def _dialogue_quality_score(self, dialogue: Dict, original_dialogue: Dict) -> float:
|
425 |
"""
|
426 |
+
Compute a quality score for the entire augmented dialogue.
|
427 |
+
For example, average semantic similarity of turns to the original turns.
|
428 |
+
This is done after the dialogue is formed.
|
429 |
"""
|
430 |
+
original_texts = [t['text'] for t in original_dialogue['turns']]
|
431 |
+
aug_texts = [t['text'] for t in dialogue['turns']]
|
432 |
+
|
433 |
+
# Compute semantic similarity turn-by-turn and average it
|
434 |
+
scores = []
|
435 |
+
for orig, aug in zip(original_texts, aug_texts):
|
436 |
+
# Simple semantic similarity for scoring
|
437 |
+
emb_orig = self._compute_embedding(orig)
|
438 |
+
emb_aug = self._compute_embedding(aug)
|
439 |
+
sim = (emb_orig @ emb_aug) / (np.linalg.norm(emb_orig)*np.linalg.norm(emb_aug))
|
440 |
+
scores.append(sim)
|
441 |
+
|
442 |
+
# Could also incorporate diversity checks, content overlap, etc.
|
443 |
+
return float(np.mean(scores)) if scores else 0.0
|
444 |
+
|
445 |
+
def _generate_dialogue_combinations(self, dialogue_id: str, turn_variations: List[List[Dict]], original_dialogue: Dict) -> List[Dict]:
|
446 |
+
"""
|
447 |
+
Generate dialogue combinations using a more controlled approach:
|
448 |
+
- Include the original turn as a fallback variation for each turn.
|
449 |
+
- Sort variations by a quality score.
|
450 |
+
- Ensure a balanced augmentation by requiring at least some turns to be augmented.
|
451 |
+
- Over-generate and then select top dialogues by quality.
|
452 |
+
"""
|
453 |
+
# Over-generate factor: create more candidates than needed
|
454 |
+
over_generate_factor = self.config.augmentation_factor * 2
|
455 |
+
|
456 |
+
# Add the original turn as a fallback variation for each turn if not present
|
457 |
+
for i, turn_variants in enumerate(turn_variations):
|
458 |
+
original_turn_text = None
|
459 |
+
# Check if we previously stored original turn text with a marker or just use the original dialogue
|
460 |
+
# If you previously used "|ORIGINAL|" marker, handle it here. Otherwise, just get from original_dialogue.
|
461 |
+
original_turn_text = original_dialogue['turns'][i]['text']
|
462 |
+
|
463 |
+
# Add the original turn as a variation if not already included
|
464 |
+
if not any(v['text'] == original_turn_text for v in turn_variants):
|
465 |
+
turn_variants.append({
|
466 |
+
'speaker': original_dialogue['turns'][i]['speaker'],
|
467 |
+
'text': original_turn_text
|
468 |
+
})
|
469 |
+
|
470 |
+
# Sort variations by score
|
471 |
+
original_text = original_dialogue['turns'][i]['text']
|
472 |
+
turn_variants.sort(key=lambda v: self._variation_score(original_text, v['text']), reverse=True)
|
473 |
+
|
474 |
augmented_dialogues = []
|
475 |
used_combinations = set()
|
476 |
+
|
477 |
+
def generate_candidates(current_turns=None, turn_index=0):
|
478 |
if current_turns is None:
|
479 |
current_turns = []
|
480 |
|
481 |
+
if len(augmented_dialogues) >= over_generate_factor:
|
482 |
return
|
483 |
|
484 |
if turn_index == len(turn_variations):
|
485 |
+
# Completed a candidate dialogue
|
486 |
dialogue_fingerprint = " | ".join(turn['text'] for turn in current_turns)
|
487 |
if dialogue_fingerprint not in used_combinations:
|
488 |
used_combinations.add(dialogue_fingerprint)
|
489 |
+
# Check if we have enough augmented turns
|
490 |
+
aug_count = sum(1 for orig, curr in zip(original_dialogue['turns'], current_turns)
|
491 |
+
if orig['text'] != curr['text'])
|
492 |
+
# Require at least half the turns to be augmented, for example
|
493 |
+
if aug_count >= max(1, len(turn_variations)//2):
|
494 |
+
augmented_dialogues.append({
|
495 |
+
'dialogue_id': f"{dialogue_id}_aug_{len(augmented_dialogues)}",
|
496 |
+
'turns': current_turns.copy()
|
497 |
+
})
|
498 |
return
|
499 |
+
|
500 |
+
turn_candidates = turn_variations[turn_index]
|
501 |
+
|
502 |
+
# If no variations are available for this turn, let's just return without error.
|
503 |
+
# Normally, this shouldn't happen since we always add the original turn above.
|
504 |
+
if not turn_candidates:
|
505 |
+
# If you want to at least have the original turn, add it now:
|
506 |
+
original_text = original_dialogue['turns'][turn_index]['text']
|
507 |
+
turn_candidates.append({
|
508 |
+
'speaker': original_dialogue['turns'][turn_index]['speaker'],
|
509 |
+
'text': original_text
|
510 |
+
})
|
511 |
+
|
512 |
+
# After the fallback, if still empty for some reason, just return.
|
513 |
+
if not turn_candidates:
|
514 |
+
return
|
515 |
+
|
516 |
+
# Example strategy:
|
517 |
+
# 1. Always try the top variation (most semantically similar).
|
518 |
+
# 2. If available and allowed, pick a mid-ranked variation for diversity.
|
519 |
+
# 3. Include the original turn if not selected yet.
|
520 |
+
|
521 |
+
num_vars = min(self.config.max_sampled_variations, len(turn_candidates))
|
522 |
+
|
523 |
+
# Always include top variation
|
524 |
+
candidates_to_pick = [turn_candidates[0]]
|
525 |
+
|
526 |
+
# If we have more than 2 variations and can pick more, add a middle variation for diversity
|
527 |
+
if len(turn_candidates) > 2 and num_vars > 1:
|
528 |
+
mid_index = len(turn_candidates)//2
|
529 |
+
candidates_to_pick.append(turn_candidates[mid_index])
|
530 |
+
|
531 |
+
# If we still have room for another variation, try adding the original turn if not included
|
532 |
+
if num_vars > len(candidates_to_pick):
|
533 |
+
original_turn_text = original_dialogue['turns'][turn_index]['text']
|
534 |
+
orig_candidate = next((v for v in turn_candidates if v['text'] == original_turn_text), None)
|
535 |
+
if orig_candidate and orig_candidate not in candidates_to_pick:
|
536 |
+
candidates_to_pick.append(orig_candidate)
|
537 |
+
|
538 |
+
# Shuffle candidates to produce different dialogues
|
539 |
+
np.random.shuffle(candidates_to_pick)
|
540 |
+
|
541 |
+
for variation in candidates_to_pick:
|
542 |
+
if len(augmented_dialogues) >= over_generate_factor:
|
543 |
return
|
544 |
current_turns.append(variation)
|
545 |
+
generate_candidates(current_turns, turn_index + 1)
|
546 |
current_turns.pop()
|
547 |
+
|
548 |
try:
|
549 |
+
generate_candidates()
|
550 |
except Exception as e:
|
551 |
print(f"Error in dialogue generation: {str(e)}")
|
552 |
return []
|
553 |
+
|
554 |
+
# Over-generated set of augmented dialogues is now available
|
555 |
+
# Let's score them and pick the top ones
|
556 |
+
scored_dialogues = []
|
557 |
+
for d in augmented_dialogues:
|
558 |
+
score = self._dialogue_quality_score(d, original_dialogue)
|
559 |
+
scored_dialogues.append((score, d))
|
560 |
+
|
561 |
+
scored_dialogues.sort(key=lambda x: x[0], reverse=True)
|
562 |
+
# Pick top `augmentation_factor` dialogues
|
563 |
+
final_dialogues = [d for _, d in scored_dialogues[:self.config.augmentation_factor]]
|
564 |
+
|
565 |
+
return final_dialogues
|
566 |
+
# def _generate_dialogue_combinations(self, dialogue_id: str, turn_variations: List[List[Dict]]) -> List[Dict]:
|
567 |
+
# """
|
568 |
+
# Generate dialogue combinations using sampling
|
569 |
+
# """
|
570 |
+
# augmented_dialogues = []
|
571 |
+
# used_combinations = set()
|
572 |
+
|
573 |
+
# def generate_dialogues(current_turns=None, turn_index=0):
|
574 |
+
# if current_turns is None:
|
575 |
+
# current_turns = []
|
576 |
+
|
577 |
+
# if len(augmented_dialogues) >= self.config.augmentation_factor:
|
578 |
+
# return
|
579 |
+
|
580 |
+
# if turn_index == len(turn_variations):
|
581 |
+
# dialogue_fingerprint = " | ".join(turn['text'] for turn in current_turns)
|
582 |
+
# if dialogue_fingerprint not in used_combinations:
|
583 |
+
# used_combinations.add(dialogue_fingerprint)
|
584 |
+
# augmented_dialogues.append({
|
585 |
+
# 'dialogue_id': f"{dialogue_id}_aug_{len(augmented_dialogues)}",
|
586 |
+
# 'turns': current_turns.copy()
|
587 |
+
# })
|
588 |
+
# return
|
589 |
+
|
590 |
+
# variations = list(turn_variations[turn_index])
|
591 |
+
# np.random.shuffle(variations)
|
592 |
+
|
593 |
+
# for variation in variations[:self.config.max_sampled_variations]:
|
594 |
+
# if len(augmented_dialogues) >= self.config.augmentation_factor:
|
595 |
+
# return
|
596 |
+
# current_turns.append(variation)
|
597 |
+
# generate_dialogues(current_turns, turn_index + 1)
|
598 |
+
# current_turns.pop()
|
599 |
+
|
600 |
+
# try:
|
601 |
+
# generate_dialogues()
|
602 |
+
# except Exception as e:
|
603 |
+
# print(f"Error in dialogue generation: {str(e)}")
|
604 |
+
# return []
|
605 |
+
|
606 |
+
# return augmented_dialogues
|
607 |
|
608 |
def _is_dialogue_duplicate(self, dialogue1: Dict, dialogue2: Dict) -> bool:
|
609 |
"""
|
|
|
616 |
def _augment_short_text(self, turn: Dict) -> List[Dict]:
|
617 |
"""
|
618 |
Special handling for very short texts with predefined variations.
|
619 |
+
If predefined variations are found, return them directly.
|
620 |
+
Otherwise, produce simple punctuation and capitalization variants.
|
621 |
+
Skip heavy quality checks for efficiency. These variations are safe and minimal.
|
|
|
|
|
622 |
"""
|
623 |
text = turn['text']
|
624 |
common_variations = {
|
|
|
652 |
'Fantastic!', 'Amazing!', 'Terrific!'
|
653 |
]
|
654 |
}
|
655 |
+
|
|
|
656 |
text_lower = text.lower().rstrip('!.,?')
|
657 |
+
# Check if text matches any predefined category
|
658 |
variations = []
|
|
|
|
|
659 |
for key, predefined_vars in common_variations.items():
|
660 |
if key in text_lower or text_lower in key:
|
661 |
variations.extend(predefined_vars)
|
662 |
|
|
|
663 |
if not variations:
|
664 |
+
# Generate simple punctuation and capitalization variations if no predefined match
|
665 |
+
base = text.rstrip('!.,?')
|
666 |
variations = [
|
667 |
+
base + '!',
|
668 |
+
base + '.',
|
669 |
+
base
|
670 |
]
|
671 |
|
672 |
# Add capitalization variations
|
673 |
+
capitalized = [v.capitalize() for v in variations if v.capitalize() not in variations]
|
674 |
+
variations.extend(capitalized)
|
|
|
|
|
675 |
|
676 |
+
# Ensure uniqueness
|
677 |
unique_variations = list(set(variations))
|
|
|
|
|
|
|
678 |
|
679 |
+
# Directly return these variations, as they are minimal and trusted
|
680 |
+
# No further quality checks are needed
|
681 |
+
result_variations = unique_variations[:self.config.augmentation_factor]
|
682 |
+
return [{'speaker': turn['speaker'], 'text': v} for v in result_variations]
|
683 |
|
684 |
+
def process_batch(self, batch: List[Dict]) -> List[Dict]:
|
685 |
+
"""Process multiple dialogues at once to maximize GPU utilization"""
|
686 |
+
results = []
|
687 |
+
|
688 |
+
# Pre-compute embeddings for all texts in batch
|
689 |
+
all_texts = []
|
690 |
+
text_to_embedding = {}
|
691 |
+
|
692 |
+
for dialogue in batch:
|
693 |
+
for turn in dialogue['turns']:
|
694 |
+
all_texts.append(turn['text'])
|
695 |
+
|
696 |
+
# Batch compute embeddings
|
697 |
+
if all_texts:
|
698 |
+
embeddings = self._compute_batch_embeddings(all_texts)
|
699 |
+
for text, embedding in zip(all_texts, embeddings):
|
700 |
+
self.embedding_cache[text] = embedding
|
701 |
|
702 |
+
# Process each dialogue using cached embeddings
|
703 |
+
for dialogue in batch:
|
704 |
+
try:
|
705 |
+
augmented = self.augment_dialogue(dialogue)
|
706 |
+
results.extend(augmented)
|
707 |
+
except Exception as e:
|
708 |
+
print(f"Error processing dialogue {dialogue.get('dialogue_id', 'unknown')}: {e}")
|
709 |
+
continue
|
710 |
|
711 |
+
return results
|
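The over-generate-then-select step above can be exercised on its own. A minimal sketch, assuming the candidate dialogues have already been produced; select_top_dialogues, score_fn, and the toy scorer are illustrative names, not part of dialogue_augmenter.py:

import numpy as np

def select_top_dialogues(candidates, score_fn, augmentation_factor):
    """Score every over-generated candidate and keep the best `augmentation_factor` of them."""
    scored = [(score_fn(d), d) for d in candidates]
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return [d for _, d in scored[:augmentation_factor]]

# Toy scorer: reward candidates whose turns differ from the original dialogue.
original = {'turns': [{'text': 'hi'}, {'text': 'hello there'}]}
candidates = [
    {'turns': [{'text': 'hi'}, {'text': 'hello there'}]},
    {'turns': [{'text': 'hey'}, {'text': 'hello there'}]},
    {'turns': [{'text': 'hey'}, {'text': 'hi, how can I help?'}]},
]
score_fn = lambda d: float(np.mean(
    [o['text'] != a['text'] for o, a in zip(original['turns'], d['turns'])]
))
print(select_top_dialogues(candidates, score_fn, augmentation_factor=2))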
main.py
CHANGED
@@ -59,13 +59,13 @@ def main():
|
|
59 |
min_length=1,
|
60 |
max_length=512,
|
61 |
batch_size=32 if tf.config.list_physical_devices('GPU') else 16,
|
62 |
-
max_turns_per_dialogue=
|
63 |
-
max_variations_per_turn=
|
64 |
max_sampled_variations=2,
|
65 |
context_window_size=4,
|
66 |
max_complexity_threshold=100,
|
67 |
use_cache=False,
|
68 |
-
debug=
|
69 |
allowed_speakers=['user', 'assistant'],
|
70 |
required_fields=['dialogue_id', 'turns']
|
71 |
)
|
|
|
59 |
min_length=1,
|
60 |
max_length=512,
|
61 |
batch_size=32 if tf.config.list_physical_devices('GPU') else 16,
|
62 |
+
max_turns_per_dialogue=12,
|
63 |
+
max_variations_per_turn=4,
|
64 |
max_sampled_variations=2,
|
65 |
context_window_size=4,
|
66 |
max_complexity_threshold=100,
|
67 |
use_cache=False,
|
68 |
+
debug=True,
|
69 |
allowed_speakers=['user', 'assistant'],
|
70 |
required_fields=['dialogue_id', 'turns']
|
71 |
)
|
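The batch_size expression above only raises the batch size when TensorFlow can actually see a GPU. A standalone sketch of the same check (the values match the config above but are purely illustrative):

import tensorflow as tf

gpus = tf.config.list_physical_devices('GPU')
batch_size = 32 if gpus else 16
print(f"GPUs visible: {len(gpus)}; using batch_size={batch_size}")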
paraphraser.py
CHANGED
@@ -9,11 +9,18 @@ class Paraphraser:
|
|
9 |
self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
|
10 |
self.model.eval()
|
11 |
|
12 |
-
def paraphrase(self, text, num_return_sequences=5, num_beams=
|
|
|
13 |
try:
|
14 |
input_text = "paraphrase: " + text + " </s>"
|
15 |
encoding = self.tokenizer.encode_plus(input_text, return_tensors="pt")
|
16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
|
18 |
outputs = self.model.generate(
|
19 |
input_ids=input_ids,
|
@@ -24,7 +31,11 @@ class Paraphraser:
|
|
24 |
diversity_penalty=diversity_penalty,
|
25 |
early_stopping=True
|
26 |
)
|
27 |
-
|
|
|
|
|
|
|
|
|
28 |
return paraphrases
|
29 |
except Exception as e:
|
30 |
print(f"Error in paraphrasing: {e}")
|
|
|
9 |
self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
|
10 |
self.model.eval()
|
11 |
|
12 |
+
def paraphrase(self, text, num_return_sequences=5, num_beams=5,
|
13 |
+
num_beam_groups=1, diversity_penalty=0.0, device=None):
|
14 |
try:
|
15 |
input_text = "paraphrase: " + text + " </s>"
|
16 |
encoding = self.tokenizer.encode_plus(input_text, return_tensors="pt")
|
17 |
+
|
18 |
+
# Move input tensors to specified device if provided
|
19 |
+
if device is not None:
|
20 |
+
input_ids = encoding["input_ids"].to(device)
|
21 |
+
self.model = self.model.to(device)
|
22 |
+
else:
|
23 |
+
input_ids = encoding["input_ids"]
|
24 |
|
25 |
outputs = self.model.generate(
|
26 |
input_ids=input_ids,
|
|
|
31 |
diversity_penalty=diversity_penalty,
|
32 |
early_stopping=True
|
33 |
)
|
34 |
+
|
35 |
+
# Move outputs back to CPU for tokenizer decoding
|
36 |
+
outputs = outputs.cpu() if device is not None else outputs
|
37 |
+
paraphrases = [self.tokenizer.decode(output, skip_special_tokens=True)
|
38 |
+
for output in outputs]
|
39 |
return paraphrases
|
40 |
except Exception as e:
|
41 |
print(f"Error in paraphrasing: {e}")
|
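A minimal usage sketch of the device-aware paraphrase() call above, assuming the Paraphraser constructor falls back to its default model; the query text and device selection are illustrative:

import torch
from paraphraser import Paraphraser

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
paraphraser = Paraphraser()
variants = paraphraser.paraphrase(
    "Book a table for two at a Korean restaurant in NYC.",
    num_return_sequences=3,
    num_beams=4,            # fewer beams -> more faithful paraphrases
    num_beam_groups=1,
    diversity_penalty=0.0,
    device=device,
)
for v in variants:
    print(v)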
pipeline_config.py
CHANGED
@@ -30,7 +30,6 @@ class PipelineConfig:
|
|
30 |
grammar_error_threshold: int = 2
|
31 |
rouge1_f1_threshold: float = 0.30
|
32 |
rouge2_f1_threshold: float = 0.15
|
33 |
-
perplexity_threshold: float = 50.0
|
34 |
|
35 |
# Response coherence thresholds
|
36 |
min_response_coherence: float = 0.3
|
|
|
30 |
grammar_error_threshold: int = 2
|
31 |
rouge1_f1_threshold: float = 0.30
|
32 |
rouge2_f1_threshold: float = 0.15
|
|
|
33 |
|
34 |
# Response coherence thresholds
|
35 |
min_response_coherence: float = 0.3
|
processing_pipeline.py
CHANGED
@@ -11,7 +11,6 @@ from pipeline_config import PipelineConfig
|
|
11 |
from dialogue_augmenter import DialogueAugmenter
|
12 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
13 |
from sklearn.metrics.pairwise import cosine_similarity
|
14 |
-
from concurrent.futures import ProcessPoolExecutor
|
15 |
from typing import Set
|
16 |
|
17 |
class ProcessingPipeline:
|
@@ -33,7 +32,11 @@ class ProcessingPipeline:
|
|
33 |
self.use_gpu = torch.cuda.is_available()
|
34 |
self.batch_size = 32 if self.use_gpu else 8
|
35 |
self.use_multiprocessing = not self.use_gpu
|
36 |
-
|
|
|
|
|
|
|
|
|
37 |
if self.config.debug:
|
38 |
print(f"ProcessingPipeline initialized with:")
|
39 |
print(f"- GPU available: {self.use_gpu}")
|
@@ -75,7 +78,7 @@ class ProcessingPipeline:
|
|
75 |
text_to_dialogue_map[turn['text']] = dialogue['dialogue_id']
|
76 |
|
77 |
# Batch process embeddings
|
78 |
-
|
79 |
|
80 |
# Process dialogues with cached embeddings
|
81 |
for dialogue in batch:
|
@@ -89,16 +92,37 @@ class ProcessingPipeline:
|
|
89 |
print(f"Error processing batch: {str(e)}")
|
90 |
return results
|
91 |
|
|
|
|
|
|
|
|
92 |
def combine_results(self) -> Path:
|
93 |
-
"""Combine all
|
94 |
all_results = []
|
95 |
-
|
96 |
|
97 |
-
print(f"Combining {len(
|
98 |
-
for
|
99 |
-
with open(
|
100 |
-
|
101 |
-
all_results.extend(
|
102 |
|
103 |
# Save combined results
|
104 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
@@ -137,12 +161,13 @@ class ProcessingPipeline:
|
|
137 |
current_position = processed_count + batch_num + len(batch)
|
138 |
|
139 |
total_progress = (current_position / total_dialogues) * 100
|
140 |
-
batch_progress = (batch_num + 1) / ((len(remaining_dialogues) + self.batch_size - 1) // self.batch_size) * 100
|
141 |
|
142 |
-
print(
|
143 |
-
|
144 |
-
f"
|
145 |
-
|
|
|
|
|
146 |
|
147 |
# Process batch
|
148 |
batch_results = self._process_batch(batch)
|
@@ -152,20 +177,37 @@ class ProcessingPipeline:
|
|
152 |
batch_ids = {d['dialogue_id'] for d in batch}
|
153 |
processed_ids.update(batch_ids)
|
154 |
self._update_checkpoint(processed_ids)
|
155 |
-
|
|
|
|
|
|
|
|
156 |
print("\n" + "-" * 50)
|
157 |
print("Processing complete. Combining results...")
|
158 |
return self.combine_results()
|
159 |
|
160 |
def cleanup(self):
|
161 |
-
"""Clean up intermediate
|
|
|
162 |
batch_files = list(self.output_dir.glob("batch_*.json"))
|
163 |
for file in batch_files:
|
164 |
try:
|
165 |
file.unlink()
|
166 |
except Exception as e:
|
167 |
print(f"Error deleting {file}: {e}")
|
168 |
-
|
|
|
|
|
|
|
169 |
if self.checkpoint_file.exists():
|
170 |
try:
|
171 |
self.checkpoint_file.unlink()
|
@@ -276,4 +318,4 @@ class ProcessingPipeline:
|
|
276 |
"""
|
277 |
data_str = json.dumps(data, sort_keys=True)
|
278 |
hash_value = hashlib.md5(data_str.encode()).hexdigest()
|
279 |
-
return self.cache_dir / f"cache_{hash_value}.pkl"
|
|
|
11 |
from dialogue_augmenter import DialogueAugmenter
|
12 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
13 |
from sklearn.metrics.pairwise import cosine_similarity
|
|
|
14 |
from typing import Set
|
15 |
|
16 |
class ProcessingPipeline:
|
|
|
32 |
self.use_gpu = torch.cuda.is_available()
|
33 |
self.batch_size = 32 if self.use_gpu else 8
|
34 |
self.use_multiprocessing = not self.use_gpu
|
35 |
+
|
36 |
+
# Counters for grouping batches
|
37 |
+
self.batch_counter = 0 # Count batches since last group combine
|
38 |
+
self.batch_group_number = 0 # How many groups have been created
|
39 |
+
|
40 |
if self.config.debug:
|
41 |
print(f"ProcessingPipeline initialized with:")
|
42 |
print(f"- GPU available: {self.use_gpu}")
|
|
|
78 |
text_to_dialogue_map[turn['text']] = dialogue['dialogue_id']
|
79 |
|
80 |
# Batch process embeddings
|
81 |
+
self.augmenter._compute_batch_embeddings(all_texts)
|
82 |
|
83 |
# Process dialogues with cached embeddings
|
84 |
for dialogue in batch:
|
|
|
92 |
print(f"Error processing batch: {str(e)}")
|
93 |
return results
|
94 |
|
95 |
+
def _combine_intermediate_batches(self):
|
96 |
+
"""
|
97 |
+
Combine all current batch_*.json files into a single batch_group_XXXX.json file,
|
98 |
+
then remove the batch_*.json files.
|
99 |
+
"""
|
100 |
+
batch_files = sorted(self.output_dir.glob("batch_*.json"))
|
101 |
+
if not batch_files:
|
102 |
+
return None # No files to combine
|
103 |
+
|
104 |
+
combined_data = []
|
105 |
+
for bf in batch_files:
|
106 |
+
with open(bf, 'r') as f:
|
107 |
+
combined_data.extend(json.load(f))
|
108 |
+
bf.unlink() # Remove the individual batch file after reading
|
109 |
+
|
110 |
+
self.batch_group_number += 1
|
111 |
+
group_file = self.output_dir / f"batch_group_{self.batch_group_number:04d}.json"
|
112 |
+
with open(group_file, 'w') as f:
|
113 |
+
json.dump(combined_data, f)
|
114 |
+
return group_file
|
115 |
+
|
116 |
def combine_results(self) -> Path:
|
117 |
+
"""Combine all batch_group_*.json files into final output"""
|
118 |
all_results = []
|
119 |
+
group_files = sorted(self.output_dir.glob("batch_group_*.json"))
|
120 |
|
121 |
+
print(f"Combining {len(group_files)} group files...")
|
122 |
+
for group_file in tqdm(group_files):
|
123 |
+
with open(group_file, 'r') as f:
|
124 |
+
group_data = json.load(f)
|
125 |
+
all_results.extend(group_data)
|
126 |
|
127 |
# Save combined results
|
128 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
|
|
161 |
current_position = processed_count + batch_num + len(batch)
|
162 |
|
163 |
total_progress = (current_position / total_dialogues) * 100
|
|
|
164 |
|
165 |
+
print('\033[K', end='')
|
166 |
+
print(f"Processing: {current_position}/{total_dialogues} dialogues "
|
167 |
+
f"({total_progress:.1f}% complete)")
|
168 |
+
print(f"Current batch: {batch_num//self.batch_size + 1} of "
|
169 |
+
f"{(len(remaining_dialogues) + self.batch_size - 1) // self.batch_size}")
|
170 |
+
print("-" * 50)
|
171 |
|
172 |
# Process batch
|
173 |
batch_results = self._process_batch(batch)
|
|
|
177 |
batch_ids = {d['dialogue_id'] for d in batch}
|
178 |
processed_ids.update(batch_ids)
|
179 |
self._update_checkpoint(processed_ids)
|
180 |
+
|
181 |
+
# Increment batch counter and combine if needed
|
182 |
+
self.batch_counter += 1
|
183 |
+
if self.batch_counter == 25:
|
184 |
+
# Combine these 25 batches into a group file
|
185 |
+
self._combine_intermediate_batches()
|
186 |
+
self.batch_counter = 0 # Reset counter after grouping
|
187 |
+
|
188 |
+
# If there are leftover batches less than 25
|
189 |
+
# combine them into one final group file
|
190 |
+
if self.batch_counter > 0:
|
191 |
+
self._combine_intermediate_batches()
|
192 |
+
self.batch_counter = 0
|
193 |
+
|
194 |
print("\n" + "-" * 50)
|
195 |
print("Processing complete. Combining results...")
|
196 |
return self.combine_results()
|
197 |
|
198 |
def cleanup(self):
|
199 |
+
"""Clean up intermediate files after successful processing"""
|
200 |
+
# Clean up any leftover batch files (should not exist if logic is correct)
|
201 |
batch_files = list(self.output_dir.glob("batch_*.json"))
|
202 |
for file in batch_files:
|
203 |
try:
|
204 |
file.unlink()
|
205 |
except Exception as e:
|
206 |
print(f"Error deleting {file}: {e}")
|
207 |
+
|
208 |
+
# We can also remove batch_group_*.json if desired after final combine
|
209 |
+
# but that might not be necessary if we want to keep them.
|
210 |
+
|
211 |
if self.checkpoint_file.exists():
|
212 |
try:
|
213 |
self.checkpoint_file.unlink()
|
|
|
318 |
"""
|
319 |
data_str = json.dumps(data, sort_keys=True)
|
320 |
hash_value = hashlib.md5(data_str.encode()).hexdigest()
|
321 |
+
return self.cache_dir / f"cache_{hash_value}.pkl"
|
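The grouping logic above rolls every 25 intermediate batch_*.json files into one batch_group_XXXX.json. A self-contained sketch of the same merge, run against a throwaway directory (file names and counts are illustrative):

import json
from pathlib import Path
from tempfile import TemporaryDirectory

with TemporaryDirectory() as tmp:
    out = Path(tmp)
    # Simulate three intermediate batch files.
    for i in range(3):
        with open(out / f"batch_{i:04d}.json", "w") as f:
            json.dump([{"dialogue_id": f"d{i}", "turns": []}], f)

    # Merge them the same way _combine_intermediate_batches does, then delete the parts.
    combined = []
    for bf in sorted(out.glob("batch_*.json")):
        with open(bf) as f:
            combined.extend(json.load(f))
        bf.unlink()

    group_file = out / "batch_group_0001.json"
    with open(group_file, "w") as f:
        json.dump(combined, f)
    print(f"{group_file.name} holds {len(combined)} dialogues")  # -> 3 dialogues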
quality_metrics.py
CHANGED
@@ -1,129 +1,47 @@
|
|
1 |
-
import torch
|
2 |
-
import tensorflow as tf
|
3 |
import tensorflow_hub as hub
|
4 |
-
from transformers import GPT2TokenizerFast, GPT2LMHeadModel
|
5 |
-
import language_tool_python
|
6 |
-
from rouge_score import rouge_scorer
|
7 |
import spacy
|
8 |
from sklearn.metrics.pairwise import cosine_similarity
|
9 |
-
import numpy as np
|
10 |
from typing import Dict
|
11 |
from pipeline_config import PipelineConfig
|
12 |
|
13 |
class QualityMetrics:
|
14 |
"""
|
15 |
-
|
16 |
"""
|
17 |
def __init__(self, config: PipelineConfig):
|
18 |
self.config = config
|
19 |
-
|
20 |
-
# Semantic similarity
|
21 |
self.use_model = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
|
22 |
-
|
23 |
-
|
24 |
-
self.tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
|
25 |
-
self.model = GPT2LMHeadModel.from_pretrained('gpt2')
|
26 |
-
self.model.eval()
|
27 |
-
|
28 |
-
# Grammar
|
29 |
-
self.language_tool = language_tool_python.LanguageTool('en-US')
|
30 |
-
|
31 |
-
# Lexical similarity
|
32 |
-
self.rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
|
33 |
-
|
34 |
-
# Diversity
|
35 |
-
self.nlp = spacy.load('en_core_web_sm')
|
36 |
-
|
37 |
-
def compute_perplexity(self, text):
|
38 |
-
try:
|
39 |
-
encodings = self.tokenizer(text, return_tensors='pt')
|
40 |
-
input_ids = encodings['input_ids']
|
41 |
-
with torch.no_grad():
|
42 |
-
outputs = self.model(input_ids, labels=input_ids)
|
43 |
-
loss = outputs.loss
|
44 |
-
perplexity = torch.exp(loss)
|
45 |
-
return perplexity.item()
|
46 |
-
except Exception as e:
|
47 |
-
print(f"Error computing perplexity for text '{text}': {e}")
|
48 |
-
return float('inf') # High perplexity value == poor quality
|
49 |
-
|
50 |
def compute_semantic_similarity(self, text1: str, text2: str) -> float:
|
51 |
-
"""
|
52 |
-
Compute semantic similarity between two texts using the Universal Sentence Encoder.
|
53 |
-
Args:
|
54 |
-
text1 (str): First text
|
55 |
-
text2 (str): Second text
|
56 |
-
Returns:
|
57 |
-
float: Cosine similarity score between the two texts (0-1)
|
58 |
-
"""
|
59 |
embeddings = self.use_model([text1, text2])
|
60 |
emb1, emb2 = embeddings[0].numpy(), embeddings[1].numpy()
|
61 |
return cosine_similarity([emb1], [emb2])[0][0]
|
62 |
|
63 |
def compute_metrics(self, original: str, augmented: str) -> Dict[str, float]:
|
64 |
-
"""
|
65 |
-
Compute quality metrics
|
66 |
-
"""
|
67 |
metrics = {}
|
68 |
-
|
69 |
-
# 1. Semantic Preservation
|
70 |
embeddings = self.use_model([original, augmented])
|
71 |
emb_orig, emb_aug = embeddings[0].numpy(), embeddings[1].numpy()
|
72 |
metrics['semantic_similarity'] = cosine_similarity([emb_orig], [emb_aug])[0][0]
|
73 |
|
74 |
-
#
|
75 |
-
metrics['perplexity'] = self.compute_perplexity(augmented)
|
76 |
-
metrics['grammar_errors'] = len(self.language_tool.check(augmented))
|
77 |
-
|
78 |
-
# 3. Lexical Diversity
|
79 |
doc_orig = self.nlp(original)
|
80 |
doc_aug = self.nlp(augmented)
|
81 |
-
|
82 |
-
# Type-token ratio with safety check
|
83 |
aug_tokens = [token.text.lower() for token in doc_aug]
|
84 |
metrics['type_token_ratio'] = len(set(aug_tokens)) / max(len(aug_tokens), 1)
|
85 |
|
86 |
-
|
87 |
-
|
88 |
-
aug_content = set([token.text.lower() for token in doc_aug if not token.is_stop])
|
89 |
-
|
90 |
-
# Safety check for empty content sets
|
91 |
if len(orig_content) == 0:
|
92 |
metrics['content_preservation'] = 1.0 if len(aug_content) == 0 else 0.0
|
93 |
else:
|
94 |
metrics['content_preservation'] = len(orig_content.intersection(aug_content)) / len(orig_content)
|
95 |
|
96 |
-
#
|
97 |
-
rouge_scores = self.rouge.score(original, augmented)
|
98 |
-
metrics['rouge1_f1'] = rouge_scores['rouge1'].fmeasure
|
99 |
-
metrics['rouge2_f1'] = rouge_scores['rouge2'].fmeasure
|
100 |
-
metrics['rougeL_f1'] = rouge_scores['rougeL'].fmeasure
|
101 |
-
|
102 |
-
# 5. Length Preservation with safety check
|
103 |
orig_words = len(original.split())
|
104 |
aug_words = len(augmented.split())
|
105 |
metrics['length_ratio'] = aug_words / max(orig_words, 1)
|
106 |
-
|
107 |
-
return metrics
|
108 |
|
109 |
-
|
110 |
-
"""
|
111 |
-
Enhanced quality threshold checking
|
112 |
-
"""
|
113 |
-
# Core quality checks
|
114 |
-
basic_quality = (
|
115 |
-
metrics['perplexity'] <= self.config.perplexity_threshold and
|
116 |
-
metrics['semantic_similarity'] >= self.config.semantic_similarity_threshold and
|
117 |
-
metrics['grammar_errors'] <= self.config.grammar_error_threshold
|
118 |
-
)
|
119 |
-
|
120 |
-
# Length preservation check
|
121 |
-
length_ok = 0.6 <= metrics['length_ratio'] <= 1.4
|
122 |
-
|
123 |
-
# Diversity check
|
124 |
-
diversity_ok = metrics['type_token_ratio'] >= 0.4
|
125 |
-
|
126 |
-
# Content preservation check
|
127 |
-
content_ok = metrics['content_preservation'] >= 0.6
|
128 |
-
|
129 |
-
return all([basic_quality, length_ok, diversity_ok, content_ok])
|
|
|
|
|
|
|
1 |
import tensorflow_hub as hub
|
|
|
|
|
|
|
2 |
import spacy
|
3 |
from sklearn.metrics.pairwise import cosine_similarity
|
|
|
4 |
from typing import Dict
|
5 |
from pipeline_config import PipelineConfig
|
6 |
|
7 |
class QualityMetrics:
|
8 |
"""
|
9 |
+
Quality metrics focusing on semantic similarity and basic lexical stats.
|
10 |
"""
|
11 |
def __init__(self, config: PipelineConfig):
|
12 |
self.config = config
|
|
|
|
|
13 |
self.use_model = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
|
14 |
+
self.nlp = spacy.load('en_core_web_md')
|
15 |
+
|
|
|
|
|
|
|
16 |
def compute_semantic_similarity(self, text1: str, text2: str) -> float:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
embeddings = self.use_model([text1, text2])
|
18 |
emb1, emb2 = embeddings[0].numpy(), embeddings[1].numpy()
|
19 |
return cosine_similarity([emb1], [emb2])[0][0]
|
20 |
|
21 |
def compute_metrics(self, original: str, augmented: str) -> Dict[str, float]:
|
|
|
|
|
|
|
22 |
metrics = {}
|
23 |
+
# Semantic similarity
|
|
|
24 |
embeddings = self.use_model([original, augmented])
|
25 |
emb_orig, emb_aug = embeddings[0].numpy(), embeddings[1].numpy()
|
26 |
metrics['semantic_similarity'] = cosine_similarity([emb_orig], [emb_aug])[0][0]
|
27 |
|
28 |
+
# Lexical diversity & content preservation
|
|
|
|
|
|
|
|
|
29 |
doc_orig = self.nlp(original)
|
30 |
doc_aug = self.nlp(augmented)
|
31 |
+
|
|
|
32 |
aug_tokens = [token.text.lower() for token in doc_aug]
|
33 |
metrics['type_token_ratio'] = len(set(aug_tokens)) / max(len(aug_tokens), 1)
|
34 |
|
35 |
+
orig_content = {token.text.lower() for token in doc_orig if not token.is_stop}
|
36 |
+
aug_content = {token.text.lower() for token in doc_aug if not token.is_stop}
|
|
|
|
|
|
|
37 |
if len(orig_content) == 0:
|
38 |
metrics['content_preservation'] = 1.0 if len(aug_content) == 0 else 0.0
|
39 |
else:
|
40 |
metrics['content_preservation'] = len(orig_content.intersection(aug_content)) / len(orig_content)
|
41 |
|
42 |
+
# Length ratio
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
orig_words = len(original.split())
|
44 |
aug_words = len(augmented.split())
|
45 |
metrics['length_ratio'] = aug_words / max(orig_words, 1)
|
|
|
|
|
46 |
|
47 |
+
return metrics
|
|
|
|
|
|
|
|
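A minimal usage sketch of the slimmed-down QualityMetrics, assuming PipelineConfig can be constructed with its defaults; the sentences and the thresholds in the gate are illustrative, not values taken from the pipeline:

from pipeline_config import PipelineConfig
from quality_metrics import QualityMetrics

qm = QualityMetrics(PipelineConfig())
metrics = qm.compute_metrics(
    "I'd like to book a table for two tonight.",
    "Could you reserve a table for two this evening?",
)

# Example gate over the returned metrics (illustrative thresholds).
acceptable = (
    metrics["semantic_similarity"] >= 0.8
    and metrics["content_preservation"] >= 0.6
    and 0.6 <= metrics["length_ratio"] <= 1.4
)
print(metrics, acceptable)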
readme.md
CHANGED
@@ -11,14 +11,16 @@ This package automatically downloads the following models during installation:
|
|
11 |
- Universal Sentence Encoder v4 (TensorFlow Hub)
|
12 |
- ChatGPT Paraphraser T5-base
|
13 |
- Helsinki-NLP translation models (en-de, de-es, es-en)
|
14 |
-
-
|
15 |
-
- spaCy en_core_web_sm
|
16 |
- nltk wordnet and averaged_perceptron_tagger_eng models
|
17 |
|
18 |
## Install package
|
19 |
|
20 |
pip install -e .
|
21 |
|
|
|
|
|
|
|
22 |
## Description
|
23 |
|
24 |
This Python script demonstrates a complete pipeline for dialogue augmentation, including validation, optimization, and data augmentation.
|
|
|
11 |
- Universal Sentence Encoder v4 (TensorFlow Hub)
|
12 |
- ChatGPT Paraphraser T5-base
|
13 |
- Helsinki-NLP translation models (en-de, de-es, es-en)
|
14 |
+
- spaCy en_core_web_sm, en_core_web_md
|
|
|
15 |
- nltk wordnet and averaged_perceptron_tagger_eng models
|
16 |
|
17 |
## Install package
|
18 |
|
19 |
pip install -e .
|
20 |
|
21 |
+
On Linux with CUDA/GPU:
|
22 |
+
pip install faiss-gpu>=1.7.0
|
23 |
+
|
24 |
## Description
|
25 |
|
26 |
This Python script demonstrates a complete pipeline for dialogue augmentation, including validation, optimization, and data augmentation.
|
requirements.txt
CHANGED
@@ -1,12 +1,14 @@
|
|
1 |
-
|
|
|
2 |
numpy>=1.19.0 # General numerical computation
|
3 |
-
tqdm>=4.64.0 # Progress bar
|
4 |
-
torch>=1.10.0 # PyTorch, for deep learning
|
5 |
-
tensorflow>=2.6.0 # TensorFlow, for deep learning
|
6 |
-
tensorflow-hub>=0.12.0 # Pretrained model hub for TensorFlow
|
7 |
-
transformers>=4.21.0 # Hugging Face Transformers library
|
8 |
-
rouge-score>=0.1.2 # ROUGE metric for evaluation
|
9 |
-
language-tool-python>=2.7.1 # Grammar checking and text correction
|
10 |
scikit-learn>=1.0.0 # Machine learning tools
|
11 |
-
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
nlpaug>=1.1.0 # Data augmentation for NLP
|
2 |
+
nltk>=3.6.0 # Natural language toolkit
|
3 |
numpy>=1.19.0 # General numerical computation
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
scikit-learn>=1.0.0 # Machine learning tools
|
5 |
+
sacremoses>=0.0.53 # Required for some HuggingFace models
|
6 |
+
sentencepiece>=0.1.99 # Required for HuggingFace transformers
|
7 |
+
spacy>=3.0.0 # Text processing and tokenization
|
8 |
+
tensorflow>=2.13.0 # TensorFlow, for deep learning
|
9 |
+
tensorflow-hub>=0.12.0 # Pretrained model hub for TensorFlow
|
10 |
+
tokenizers>=0.13.0 # Required for HuggingFace transformers
|
11 |
+
torch>=2.0.0 # PyTorch, for deep learning
|
12 |
+
tqdm>=4.64.0 # Progress bar
|
13 |
+
transformers>=4.30.0 # Hugging Face Transformers library
|
14 |
+
faiss-cpu>=1.7.0 # Required for Facebook AI Similarity Search
|
response_quality_checker.py
ADDED
@@ -0,0 +1,164 @@
|
|
|
1 |
+
import numpy as np
|
2 |
+
from typing import List, Tuple, Dict, Any
|
3 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
4 |
+
from chatbot4 import RetrievalChatbot
|
5 |
+
|
6 |
+
class ResponseQualityChecker:
|
7 |
+
"""Handles quality checking and confidence scoring for chatbot responses."""
|
8 |
+
|
9 |
+
def __init__(
|
10 |
+
self,
|
11 |
+
chatbot: RetrievalChatbot,
|
12 |
+
confidence_threshold: float = 0.5,
|
13 |
+
diversity_threshold: float = 0.1,
|
14 |
+
min_response_length: int = 3,
|
15 |
+
max_similarity_ratio: float = 0.9
|
16 |
+
):
|
17 |
+
self.confidence_threshold = confidence_threshold
|
18 |
+
self.diversity_threshold = diversity_threshold
|
19 |
+
self.min_response_length = min_response_length
|
20 |
+
self.max_similarity_ratio = max_similarity_ratio
|
21 |
+
self.chatbot = chatbot
|
22 |
+
|
23 |
+
def check_response_quality(
|
24 |
+
self,
|
25 |
+
query: str,
|
26 |
+
responses: List[Tuple[str, float]]
|
27 |
+
) -> Dict[str, Any]:
|
28 |
+
"""
|
29 |
+
Evaluate the quality of the responses based on various metrics.
|
30 |
+
"""
|
31 |
+
# Calculate diversity based on the responses themselves
|
32 |
+
diversity = self.calculate_diversity(responses)
|
33 |
+
|
34 |
+
# Calculate relevance based on some criteria
|
35 |
+
relevance = self.calculate_relevance(query, responses)
|
36 |
+
|
37 |
+
# Calculate length scores for each response
|
38 |
+
length_scores = [self._calculate_length_score(response) for response, _ in responses]
|
39 |
+
avg_length_score = np.mean(length_scores) if length_scores else 0.0
|
40 |
+
|
41 |
+
# Extract similarity scores
|
42 |
+
similarity_scores = [score for _, score in responses]
|
43 |
+
|
44 |
+
# Calculate score gap
|
45 |
+
score_gap = self._calculate_score_gap(similarity_scores, top_n=3)
|
46 |
+
|
47 |
+
# Aggregate metrics
|
48 |
+
metrics = {
|
49 |
+
'top_score': similarity_scores[0] if similarity_scores else 0.0,
|
50 |
+
'response_diversity': diversity,
|
51 |
+
'query_response_relevance': relevance,
|
52 |
+
'response_length_score': avg_length_score,
|
53 |
+
'top_3_score_gap': score_gap
|
54 |
+
}
|
55 |
+
|
56 |
+
# Determine overall confidence
|
57 |
+
is_confident = self._determine_confidence(metrics)
|
58 |
+
|
59 |
+
return {
|
60 |
+
'diversity': diversity,
|
61 |
+
'relevance': relevance,
|
62 |
+
'is_confident': is_confident,
|
63 |
+
'top_score': metrics['top_score'],
|
64 |
+
'response_diversity': metrics['response_diversity'],
|
65 |
+
'query_response_relevance': metrics['query_response_relevance'],
|
66 |
+
'response_length_score': metrics['response_length_score'],
|
67 |
+
'top_3_score_gap': metrics['top_3_score_gap']
|
68 |
+
}
|
69 |
+
|
70 |
+
def calculate_diversity(self, responses: List[Tuple[str, float]]) -> float:
|
71 |
+
"""
|
72 |
+
Calculate diversity as the average pairwise similarity between responses.
|
73 |
+
Lower similarity indicates higher diversity.
|
74 |
+
"""
|
75 |
+
if not responses:
|
76 |
+
return 0.0
|
77 |
+
|
78 |
+
# Encode responses
|
79 |
+
embeddings = [self.encode_text(response) for response, _ in responses]
|
80 |
+
if len(embeddings) < 2:
|
81 |
+
return 1.0 # Maximum diversity
|
82 |
+
|
83 |
+
# Compute pairwise cosine similarity
|
84 |
+
similarity_matrix = cosine_similarity(embeddings)
|
85 |
+
|
86 |
+
# Exclude diagonal
|
87 |
+
sum_similarities = np.sum(similarity_matrix) - len(responses)
|
88 |
+
num_pairs = len(responses) * (len(responses) - 1)
|
89 |
+
avg_similarity = sum_similarities / num_pairs if num_pairs > 0 else 0.0
|
90 |
+
diversity_score = 1 - avg_similarity # Higher value indicates more diversity
|
91 |
+
return diversity_score
|
92 |
+
|
93 |
+
def calculate_relevance(self, query: str, responses: List[Tuple[str, float]]) -> float:
|
94 |
+
"""
|
95 |
+
Calculate relevance as the average similarity between the query and each response.
|
96 |
+
"""
|
97 |
+
if not responses:
|
98 |
+
return 0.0
|
99 |
+
|
100 |
+
# Encode query
|
101 |
+
query_embedding = self.encode_query(query)
|
102 |
+
|
103 |
+
# Encode responses
|
104 |
+
response_embeddings = [self.encode_text(response) for response, _ in responses]
|
105 |
+
|
106 |
+
# Compute cosine similarity
|
107 |
+
similarities = cosine_similarity([query_embedding], response_embeddings)[0]
|
108 |
+
|
109 |
+
avg_relevance = np.mean(similarities) if similarities.size > 0 else 0.0
|
110 |
+
return avg_relevance
|
111 |
+
|
112 |
+
def _calculate_length_score(self, response: str) -> float:
|
113 |
+
"""Score based on response length appropriateness."""
|
114 |
+
length = len(response.split())
|
115 |
+
if length < self.min_response_length:
|
116 |
+
return length / self.min_response_length
|
117 |
+
return 1.0
|
118 |
+
|
119 |
+
def _calculate_score_gap(self, scores: List[float], top_n: int = 3) -> float:
|
120 |
+
"""
|
121 |
+
Calculate the average gap between the top N scores.
|
122 |
+
|
123 |
+
Args:
|
124 |
+
scores (List[float]): List of similarity scores.
|
125 |
+
top_n (int): Number of top scores to consider.
|
126 |
+
|
127 |
+
Returns:
|
128 |
+
float: Average score gap.
|
129 |
+
"""
|
130 |
+
if len(scores) < top_n + 1:
|
131 |
+
return 0.0
|
132 |
+
gaps = [scores[i] - scores[i + 1] for i in range(top_n)]
|
133 |
+
avg_gap = np.mean(gaps)
|
134 |
+
return avg_gap
|
135 |
+
|
136 |
+
def _determine_confidence(self, metrics: Dict[str, float]) -> bool:
|
137 |
+
"""
|
138 |
+
Determine if we're confident enough in the response.
|
139 |
+
|
140 |
+
Returns:
|
141 |
+
bool: True if we should use this response, False if we should abstain
|
142 |
+
"""
|
143 |
+
conditions = [
|
144 |
+
metrics['top_score'] >= self.confidence_threshold,
|
145 |
+
metrics['response_diversity'] >= self.diversity_threshold,
|
146 |
+
metrics['response_length_score'] >= 0.8,
|
147 |
+
metrics['query_response_relevance'] >= 0.3, # was 0.5
|
148 |
+
metrics['top_3_score_gap'] >= 0.05 # was 0.1
|
149 |
+
]
|
150 |
+
return all(conditions)
|
151 |
+
|
152 |
+
def encode_text(self, text: str) -> np.ndarray:
|
153 |
+
# 1) Turn text into a list if your encode_responses() expects a list.
|
154 |
+
# 2) Then call the method from the chatbot to get the embedding.
|
155 |
+
embedding_tensor = self.chatbot.encode_responses([text]) # returns tf.Tensor of shape (1, emb_dim)
|
156 |
+
embedding = embedding_tensor.numpy()[0].astype('float32') # shape: (emb_dim,)
|
157 |
+
embedding = embedding / np.linalg.norm(embedding) if np.linalg.norm(embedding) > 0 else embedding
|
158 |
+
return embedding
|
159 |
+
|
160 |
+
def encode_query(self, query: str) -> np.ndarray:
|
161 |
+
embedding_tensor = self.chatbot.encode_query(query) # returns tf.Tensor of shape (1, emb_dim)
|
162 |
+
embedding = embedding_tensor.numpy()[0].astype('float32') # shape: (emb_dim,)
|
163 |
+
embedding = embedding / np.linalg.norm(embedding) if np.linalg.norm(embedding) > 0 else embedding
|
164 |
+
return embedding
|
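A minimal usage sketch of the checker. StubChatbot is a hypothetical stand-in for any object exposing encode_query/encode_responses, and the (response, score) pairs are made-up retrieval results:

import numpy as np
import tensorflow as tf
from response_quality_checker import ResponseQualityChecker

class StubChatbot:
    """Stand-in exposing the two encode methods the checker calls."""
    def encode_responses(self, texts):
        return tf.constant(np.random.rand(len(texts), 8), dtype=tf.float32)
    def encode_query(self, query):
        return tf.constant(np.random.rand(1, 8), dtype=tf.float32)

checker = ResponseQualityChecker(chatbot=StubChatbot(), confidence_threshold=0.5)
query = "Can you recommend a Korean restaurant in NYC?"
responses = [
    ("Sure, there's a great Korean BBQ spot in Midtown.", 0.82),
    ("Try the Korean place near the station.", 0.74),
    ("I can help you book movie tickets.", 0.41),
]
result = checker.check_response_quality(query, responses)
print("confident:", result["is_confident"], "relevance:", round(float(result["relevance"]), 3))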
run_model.py
ADDED
@@ -0,0 +1,162 @@
|
|
|
|
1 |
+
import json
|
2 |
+
import glob
|
3 |
+
import os
|
4 |
+
from chatbot import RetrievalChatbot
|
5 |
+
import tensorflow as tf
|
6 |
+
from sklearn.model_selection import train_test_split
|
7 |
+
import matplotlib.pyplot as plt
|
8 |
+
|
9 |
+
def load_training_data(data_directory: str) -> list:
|
10 |
+
"""Load and combine dialogue data from multiple JSON files."""
|
11 |
+
all_dialogues = []
|
12 |
+
|
13 |
+
# Get all json files matching the pattern
|
14 |
+
pattern = os.path.join(data_directory, "batch_*.json")
|
15 |
+
json_files = sorted(glob.glob(pattern))
|
16 |
+
|
17 |
+
print(f"Found {len(json_files)} batch files")
|
18 |
+
|
19 |
+
for file_path in json_files:
|
20 |
+
try:
|
21 |
+
with open(file_path, 'r', encoding='utf-8') as f:
|
22 |
+
batch_dialogues = json.load(f)
|
23 |
+
all_dialogues.extend(batch_dialogues)
|
24 |
+
print(f"Loaded {len(batch_dialogues)} dialogues from {os.path.basename(file_path)}")
|
25 |
+
except Exception as e:
|
26 |
+
print(f"Error loading {file_path}: {str(e)}")
|
27 |
+
|
28 |
+
print(f"Total dialogues loaded: {len(all_dialogues)}")
|
29 |
+
return all_dialogues
|
30 |
+
|
31 |
+
def plot_training_history(train_losses, val_losses):
|
32 |
+
# Plot training and validation loss
|
33 |
+
plt.figure()
|
34 |
+
plt.plot(train_losses, label='Train Loss')
|
35 |
+
plt.plot(val_losses, label='Val Loss')
|
36 |
+
plt.xlabel('Epoch')
|
37 |
+
plt.ylabel('Triplet Loss')
|
38 |
+
plt.legend()
|
39 |
+
plt.show()
|
40 |
+
|
41 |
+
dialogues = load_training_data('processed_outputs')
|
42 |
+
|
43 |
+
# Initialize the chatbot
|
44 |
+
chatbot = RetrievalChatbot(
|
45 |
+
vocab_size=10000,
|
46 |
+
max_sequence_length=80,
|
47 |
+
embedding_dim=256,
|
48 |
+
lstm_units=256,
|
49 |
+
num_attention_heads=8,
|
50 |
+
margin=0.3
|
51 |
+
)
|
52 |
+
|
53 |
+
# Prepare the dataset for triplet training
|
54 |
+
q_pad, p_pad, n_pad = chatbot.prepare_dataset(dialogues, neg_samples_per_pos=3)
|
55 |
+
|
56 |
+
# Train with triplet loss
|
57 |
+
train_losses, val_losses = chatbot.train_with_triplet_loss(
|
58 |
+
q_pad, p_pad, n_pad,
|
59 |
+
epochs=1,
|
60 |
+
batch_size=32,
|
61 |
+
validation_split=0.2
|
62 |
+
)
|
63 |
+
|
64 |
+
plot_training_history(train_losses, val_losses)
|
65 |
+
|
66 |
+
# After training, test prediction
|
67 |
+
response_candidates = [turn['text'] for d in dialogues for turn in d['turns'] if turn['speaker'] == 'assistant']
|
68 |
+
|
69 |
+
# Test retrieval
|
70 |
+
test_query = "I'd like a recommendation for a Korean restaurant in NYC."
|
71 |
+
top_responses = chatbot.retrieve_top_n(test_query, response_candidates, top_n=5)
|
72 |
+
print("Top responses:")
|
73 |
+
for resp, score in top_responses:
|
74 |
+
print(f"Score: {score:.4f} - {resp}")
|
75 |
+
|
76 |
+
# Single-turn validation:
|
77 |
+
test_queries = [
|
78 |
+
"I want to book a Korean restaurant in NYC.",
|
79 |
+
"Can I get two tickets for 'What Men Want'?",
|
80 |
+
"What's the best time to watch the movie today?"
|
81 |
+
]
|
82 |
+
for query in test_queries:
|
83 |
+
top_responses = chatbot.retrieve_top_n(query, response_candidates, top_n=3)
|
84 |
+
print(f"\nQuery: {query}")
|
85 |
+
for resp, score in top_responses:
|
86 |
+
print(f"Score: {score:.4f} - {resp}")
|
87 |
+
|
88 |
+
# Multi-turn conversation:
|
89 |
+
multi_turn_history = []
|
90 |
+
|
91 |
+
def update_context(multi_turn_history, query, response, max_context_turns=3):
|
92 |
+
multi_turn_history.append((query, response))
|
93 |
+
if len(multi_turn_history) > max_context_turns:
|
94 |
+
multi_turn_history.pop(0)
|
95 |
+
|
96 |
+
def get_context_enhanced_query(multi_turn_history, query):
|
97 |
+
if not multi_turn_history:
|
98 |
+
return query
|
99 |
+
context = " ".join([f"User: {q} Assistant: {r}" for q, r in multi_turn_history])
|
100 |
+
return f"{context} User: {query}"
|
101 |
+
|
102 |
+
conversation_queries = [
|
103 |
+
"I'd like to watch a movie tonight.",
|
104 |
+
"Is there a showing of 'What Men Want'?",
|
105 |
+
"What time is the last show?",
|
106 |
+
"Can I get two tickets?"
|
107 |
+
]
|
108 |
+
|
109 |
+
for query in conversation_queries:
|
110 |
+
context_query = get_context_enhanced_query(multi_turn_history, query)
|
111 |
+
top_responses = chatbot.retrieve_top_n(context_query, response_candidates, top_n=3)
|
112 |
+
best_response = top_responses[0][0]
|
113 |
+
print(f"\nUser: {query}\nAssistant: {best_response}")
|
114 |
+
update_context(multi_turn_history, query, best_response)
|
115 |
+
|
116 |
+
|
117 |
+
|
118 |
+
|
119 |
+
|
120 |
+
|
121 |
+
|
122 |
+
|
123 |
+
#queries, responses, labels = chatbot.prepare_dataset(dialogues, neg_samples_per_pos=3)
|
124 |
+
|
125 |
+
#train_dialogues, val_dialogues = train_test_split(dialogues, test_size=0.2, random_state=20)
|
126 |
+
#query_train, query_val, response_train, response_val, labels_train, labels_val = train_test_split(queries, responses, labels, test_size=0.2, random_state=20)
|
127 |
+
|
128 |
+
# chatbot.model.compile(
|
129 |
+
# optimizer=tf.keras.optimizers.Adam(learning_rate=0.001, clipnorm=1.0),
|
130 |
+
# loss='binary_crossentropy',
|
131 |
+
# metrics=['accuracy']
|
132 |
+
# )
|
133 |
+
|
134 |
+
# # Train the model with early stopping to prevent overfitting
|
135 |
+
# callbacks = [
|
136 |
+
# tf.keras.callbacks.EarlyStopping(
|
137 |
+
# monitor='val_loss',
|
138 |
+
# patience=3,
|
139 |
+
# restore_best_weights=True
|
140 |
+
# ),
|
141 |
+
# tf.keras.callbacks.ReduceLROnPlateau(
|
142 |
+
# monitor='val_loss',
|
143 |
+
# factor=0.5,
|
144 |
+
# patience=2,
|
145 |
+
# min_lr=1e-6,
|
146 |
+
# verbose=1
|
147 |
+
# ),
|
148 |
+
# tf.keras.callbacks.ModelCheckpoint(
|
149 |
+
# 'chatbot_model.keras',
|
150 |
+
# monitor='val_loss',
|
151 |
+
# save_best_only=True
|
152 |
+
# )
|
153 |
+
# ]
|
154 |
+
|
155 |
+
# history = chatbot.model.fit(
|
156 |
+
# {'query_input': query_train, 'response_input': response_train},
|
157 |
+
# labels_train,
|
158 |
+
# validation_data=({'query_input': query_val, 'response_input': response_val}, labels_val),
|
159 |
+
# epochs=5,
|
160 |
+
# batch_size=32,
|
161 |
+
# callbacks=callbacks
|
162 |
+
# )
|
run_model2.py
ADDED
@@ -0,0 +1,340 @@
|
|
|
|
|
|
1 |
+
from chatbot2 import RetrievalChatbot, ChatbotConfig
|
2 |
+
import os
|
3 |
+
import json
|
4 |
+
import glob
|
5 |
+
import matplotlib.pyplot as plt
|
6 |
+
import logging
|
7 |
+
from pathlib import Path
|
8 |
+
from typing import List, Dict, Optional, Any, Tuple
|
9 |
+
import numpy as np
|
10 |
+
from datetime import datetime
|
11 |
+
from response_quality_checker import ResponseQualityChecker
|
12 |
+
|
13 |
+
# Configure logging
|
14 |
+
logging.basicConfig(
|
15 |
+
level=logging.INFO,
|
16 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
17 |
+
)
|
18 |
+
logger = logging.getLogger(__name__)
|
19 |
+
|
20 |
+
def load_training_data(data_directory: str, debug_samples: Optional[int] = None) -> list:
|
21 |
+
"""
|
22 |
+
Load and combine dialogue data from multiple JSON files.
|
23 |
+
|
24 |
+
Args:
|
25 |
+
data_directory: Directory containing the dialogue files
|
26 |
+
debug_samples: If set, only load this many dialogues for debugging
|
27 |
+
"""
|
28 |
+
all_dialogues = []
|
29 |
+
data_directory = Path(data_directory)
|
30 |
+
|
31 |
+
# Get all json files matching the pattern
|
32 |
+
pattern = "batch_*.json"
|
33 |
+
json_files = sorted(data_directory.glob(pattern))
|
34 |
+
|
35 |
+
logger.info(f"Found {len(json_files)} batch files")
|
36 |
+
|
37 |
+
if debug_samples:
|
38 |
+
logger.info(f"Debug mode: Will load up to {debug_samples} dialogues")
|
39 |
+
|
40 |
+
for file_path in json_files:
|
41 |
+
try:
|
42 |
+
with open(file_path, 'r', encoding='utf-8') as f:
|
43 |
+
batch_dialogues = json.load(f)
|
44 |
+
|
45 |
+
# If in debug mode, only take what we need from this batch
|
46 |
+
if debug_samples is not None:
|
47 |
+
remaining_samples = debug_samples - len(all_dialogues)
|
48 |
+
if remaining_samples <= 0:
|
49 |
+
break
|
50 |
+
batch_dialogues = batch_dialogues[:remaining_samples]
|
51 |
+
|
52 |
+
all_dialogues.extend(batch_dialogues)
|
53 |
+
logger.info(f"Loaded {len(batch_dialogues)} dialogues from {file_path.name}")
|
54 |
+
|
55 |
+
# If we've reached our debug sample limit, stop loading
|
56 |
+
if debug_samples is not None and len(all_dialogues) >= debug_samples:
|
57 |
+
logger.info(f"Debug mode: Reached {debug_samples} samples, stopping load")
|
58 |
+
break
|
59 |
+
|
60 |
+
except Exception as e:
|
61 |
+
logger.error(f"Error loading {file_path}: {str(e)}")
|
62 |
+
|
63 |
+
total_loaded = len(all_dialogues)
|
64 |
+
if debug_samples:
|
65 |
+
logger.info(f"Debug mode: Loaded {total_loaded}/{debug_samples} requested dialogues")
|
66 |
+
else:
|
67 |
+
logger.info(f"Total dialogues loaded: {total_loaded}")
|
68 |
+
|
69 |
+
return all_dialogues
|
70 |
+
|
71 |
+
def plot_training_history(history: Dict[str, List[float]], save_dir: Path = None):
|
72 |
+
"""Plot and optionally save training history."""
|
73 |
+
# Create figure with two subplots
|
74 |
+
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 12))
|
75 |
+
|
76 |
+
# Plot losses
|
77 |
+
ax1.plot(history['train_loss'], label='Train Loss')
|
78 |
+
ax1.plot(history['val_loss'], label='Validation Loss')
|
79 |
+
ax1.set_xlabel('Epoch')
|
80 |
+
ax1.set_ylabel('Triplet Loss')
|
81 |
+
ax1.set_title('Training and Validation Loss')
|
82 |
+
ax1.legend()
|
83 |
+
ax1.grid(True)
|
84 |
+
|
85 |
+
# Plot learning rate if available
|
86 |
+
if 'learning_rate' in history:
|
87 |
+
ax2.plot(history['learning_rate'], label='Learning Rate')
|
88 |
+
ax2.set_xlabel('Step')
|
89 |
+
ax2.set_ylabel('Learning Rate')
|
90 |
+
ax2.set_title('Learning Rate Schedule')
|
91 |
+
ax2.legend()
|
92 |
+
ax2.grid(True)
|
93 |
+
|
94 |
+
plt.tight_layout()
|
95 |
+
|
96 |
+
# Save if directory provided
|
97 |
+
if save_dir:
|
98 |
+
save_dir = Path(save_dir)
|
99 |
+
save_dir.mkdir(parents=True, exist_ok=True)
|
100 |
+
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
101 |
+
plt.savefig(save_dir / f'training_history_{timestamp}.png')
|
102 |
+
|
103 |
+
plt.show()
|
104 |
+
|
105 |
+
def setup_training_directories(base_dir: str = "chatbot_training") -> Dict[str, Path]:
|
106 |
+
"""Setup directory structure for training artifacts."""
|
107 |
+
base_dir = Path(base_dir)
|
108 |
+
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
109 |
+
train_dir = base_dir / f"training_run_{timestamp}"
|
110 |
+
|
111 |
+
    directories = {
        'base': train_dir,
        'checkpoints': train_dir / 'checkpoints',
        'plots': train_dir / 'plots',
        'logs': train_dir / 'logs'
    }

    # Create directories
    for dir_path in directories.values():
        dir_path.mkdir(parents=True, exist_ok=True)

    return directories

def run_automatic_validation(
    chatbot,
    response_pool: List[str],
    quality_checker: ResponseQualityChecker,
    num_examples: int = 5
) -> Dict[str, Any]:
    """
    Run automatic validation with quality metrics.
    """
    logger.info("\n=== Running Automatic Validation ===")

    test_queries = [
        "Hello, how are you today?",
        "What's the weather like?",
        "Can you help me with a problem?",
        "Tell me a joke",
        "What time is it?",
        "I need help with my homework",
        "Where's a good place to eat?",
        "What movies are playing?",
        "How do I reset my password?",
        "Can you recommend a book?"
    ]

    test_queries = test_queries[:num_examples]
    metrics_history = []

    for i, query in enumerate(test_queries, 1):
        logger.info(f"\nTest Case {i}:")
        logger.info(f"Query: {query}")

        # Get responses and scores
        responses = chatbot.retrieve_responses(
            query,
            response_pool,
            context=None,
            top_k=5
        )

        # Check quality
        quality_metrics = quality_checker.check_response_quality(
            query, responses, response_pool
        )
        metrics_history.append(quality_metrics)

        # Log results
        logger.info(f"Quality Metrics: {quality_metrics}")
        logger.info("Top responses:")
        for j, (response, score) in enumerate(responses[:3], 1):
            logger.info(f"{j}. Score: {score:.4f}")
            logger.info(f" Response: {response}")
            if j == 1 and not quality_metrics['is_confident']:
                logger.info(" [Low Confidence - Would abstain from answering]")

    # Calculate aggregate metrics
    aggregate_metrics = {
        'num_queries_tested': len(test_queries),
        'avg_top_response_score': np.mean([m['top_score'] for m in metrics_history]),
        'avg_diversity': np.mean([m['response_diversity'] for m in metrics_history]),
        'avg_relevance': np.mean([m['query_response_relevance'] for m in metrics_history]),
        'confidence_rate': np.mean([m['is_confident'] for m in metrics_history]),
    }

    logger.info("\n=== Validation Summary ===")
    for metric, value in aggregate_metrics.items():
        logger.info(f"{metric}: {value:.4f}")

    return aggregate_metrics

def chat_with_quality_check(
    chatbot,
    query: str,
    response_pool: List[str],
    conversation_history: List[Tuple[str, str]],
    quality_checker: ResponseQualityChecker
) -> Tuple[Optional[str], List[Tuple[str, float]], Dict[str, Any]]:
    """
    Enhanced chat function with quality checking.
    """
    # Get responses and scores
    responses = chatbot.retrieve_responses(
        query,
        response_pool,
        conversation_history
    )

    # Check quality
    quality_metrics = quality_checker.check_response_quality(
        query, responses, response_pool
    )

    if quality_metrics['is_confident']:
        return responses[0][0], responses, quality_metrics
    else:
        uncertainty_response = (
            "I apologize, but I don't feel confident providing an answer to that "
            "question at the moment. Could you please rephrase or ask something else?"
        )
        return uncertainty_response, responses, quality_metrics

def get_total_steps(dialogues: List[Dict[str, Any]], batch_size: int, epochs: int) -> int:
    """
    Calculate total training steps based on dialogues and batch size.
    Assume 80% of data for training due to validation split
    """
    estimated_train_samples = int(len(dialogues) * 0.8)
    steps_per_epoch = estimated_train_samples // batch_size
    return steps_per_epoch * epochs

def main():
    DEBUG_SAMPLES = 350
    BATCH_SIZE = 32
    EPOCHS = 5 if DEBUG_SAMPLES else 10

    # Setup training directories
    dirs = setup_training_directories()

    # Load training data
    dialogues = load_training_data('processed_outputs', debug_samples=DEBUG_SAMPLES)
    total_steps = get_total_steps(dialogues, BATCH_SIZE, EPOCHS)

    # Initialize configuration
    config = ChatbotConfig(
        embedding_dim=32,  # TODO: 256
        encoder_units=32,  # TODO: 256
        num_attention_heads=2,  # TODO: 8
        warmup_steps=int(total_steps * 0.1),  # 10% of total steps for warmup
    )

    # Save configuration
    with open(dirs['base'] / 'config.json', 'w') as f:
        json.dump(config.to_dict(), f, indent=2)

    # Initialize chatbot
    chatbot = RetrievalChatbot(config)

    # Prepare dataset
    logger.info("Preparing dataset...")

    # Prepare and train with debug samples
    q_pad, p_pad, n_pad = chatbot.prepare_dataset(
        dialogues,
        neg_samples_per_pos=3,
        debug_samples=DEBUG_SAMPLES
    )

    # Train model
    logger.info("Starting training...")
    chatbot.train(
        q_pad, p_pad, n_pad,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        checkpoint_dir=dirs['checkpoints']
    )

    # Plot and save training history
    plot_training_history(chatbot.history, save_dir=dirs['plots'])

    # Save final model
    chatbot.save_models(dirs['base'] / 'final_model')

    # Prepare response pool for chat
    response_pool = [
        turn['text'] for d in dialogues
        for turn in d['turns'] if turn['speaker'] == 'assistant'
    ]

    # Initialize quality checker with appropriate thresholds
    quality_checker = ResponseQualityChecker(
        confidence_threshold=0.6 if not DEBUG_SAMPLES else 0.4,  # Lower threshold for debug
        diversity_threshold=0.2,
        min_response_length=10,
        max_similarity_ratio=0.9
    )

    # Run automatic validation
    validation_metrics = run_automatic_validation(
        chatbot,
        response_pool,
        quality_checker,
        num_examples=5 if DEBUG_SAMPLES else 10
    )

    # Log validation metrics
    logger.info(f"Validation Metrics: {validation_metrics}")

    # Now continue with interactive chat
    logger.info("\nStarting interactive chat session...")
    conversation_history = []

    while True:
        query = input("\nYou: ")
        if query.lower() in ['quit', 'exit', 'bye']:
            break

        try:
            response, candidates = chatbot.chat(
                query,
                response_pool,
                conversation_history
            )
            print(f"\nAssistant: {response}")

            # Print top alternative responses
            print("\nAlternative responses:")
            for resp, score in candidates[1:4]:
                print(f"Score: {score:.4f} - {resp}")

            # Update history
            conversation_history.append((query, response))

        except Exception as e:
            logger.error(f"Error during chat: {str(e)}")
            print("Sorry, I encountered an error. Please try again.")

if __name__ == "__main__":
    main()
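A minimal, self-contained sketch of the confidence-gating idea used in chat_with_quality_check above. The helper name gate_response and the 0.6 threshold are illustrative only; the real decision lives in ResponseQualityChecker.check_response_quality().

from typing import List, Optional, Tuple

def gate_response(candidates: List[Tuple[str, float]],
                  confidence_threshold: float = 0.6) -> Optional[str]:
    """Return the top-scoring candidate, or None to signal abstention."""
    if not candidates:
        return None
    best_text, best_score = candidates[0]
    return best_text if best_score >= confidence_threshold else None

# Confident retrieval -> answer; weak retrieval -> None, so the caller falls back to an apology.
print(gate_response([("Sure, I can help with that.", 0.82)]))
print(gate_response([("Paris is in France.", 0.31)]))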
run_model3.py
ADDED
@@ -0,0 +1,434 @@
from chatbot3 import RetrievalChatbot, ChatbotConfig
import os
import json
import glob
import tensorflow as tf
import matplotlib.pyplot as plt
import logging
from pathlib import Path
from typing import List, Dict, Optional, Any, Tuple
import numpy as np
from datetime import datetime
from response_quality_checker import ResponseQualityChecker
import torch
from transformers import TFAutoModel, AutoTokenizer


policy = tf.keras.mixed_precision.Policy('mixed_float16')
tf.keras.mixed_precision.set_global_policy(policy)

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

def setup_model_cache(cache_dir: Optional[Path] = None) -> Path:
    """Setup and manage model cache directory."""
    if cache_dir is None:
        cache_dir = Path.home() / '.chatbot_cache'

    cache_dir.mkdir(parents=True, exist_ok=True)

    # Set environment variables for various libraries
    os.environ['TRANSFORMERS_CACHE'] = str(cache_dir / 'transformers')
    os.environ['TORCH_HOME'] = str(cache_dir / 'torch')
    os.environ['HF_HOME'] = str(cache_dir / 'huggingface')

    logger.info(f"Using cache directory: {cache_dir}")
    return cache_dir

def setup_gpu():
    """Configure GPU settings for optimal performance."""
    logger.info("Checking GPU availability...")

    gpus = tf.config.list_physical_devices('GPU')
    if gpus:
        try:
            # Allow memory growth to prevent taking all GPU memory at once
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            logger.info(f"Found {len(gpus)} GPU(s). Memory growth enabled.")

            # Log GPU info
            for gpu in gpus:
                logger.info(f"GPU Device: {gpu}")
            return True
        except Exception as e:
            logger.error(f"Error configuring GPU: {str(e)}")
            return False

    else:
        logger.info("No GPU found. Using CPU.")
        return False

def preload_models(config: ChatbotConfig, cache_dir: Path):
    """Preload and cache models."""
    logger.info("Preloading models...")

    # Cache DistilBERT
    model_name = config.pretrained_model
    cache_path = cache_dir / 'transformers' / model_name

    if not cache_path.exists():
        logger.info(f"Downloading and caching {model_name}...")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = TFAutoModel.from_pretrained(model_name)

        # Save to cache
        tokenizer.save_pretrained(cache_path)
        model.save_pretrained(cache_path)
    else:
        logger.info(f"Using cached model from {cache_path}")

    return cache_path

def load_training_data(data_directory: str, debug_samples: Optional[int] = None) -> list:
    """
    Load and combine dialogue data from multiple JSON files.

    Args:
        data_directory: Directory containing the dialogue files
        debug_samples: If set, only load this many dialogues for debugging
    """
    all_dialogues = []
    data_directory = Path(data_directory)

    # Get all json files matching the pattern
    pattern = "batch_*.json"
    json_files = sorted(data_directory.glob(pattern))

    logger.info(f"Found {len(json_files)} batch files")

    if debug_samples:
        logger.info(f"Debug mode: Will load up to {debug_samples} dialogues")

    for file_path in json_files:
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                batch_dialogues = json.load(f)

            # If in debug mode, only take what we need from this batch
            if debug_samples is not None:
                remaining_samples = debug_samples - len(all_dialogues)
                if remaining_samples <= 0:
                    break
                batch_dialogues = batch_dialogues[:remaining_samples]

            all_dialogues.extend(batch_dialogues)
            logger.info(f"Loaded {len(batch_dialogues)} dialogues from {file_path.name}")

            # If we've reached our debug sample limit, stop loading
            if debug_samples is not None and len(all_dialogues) >= debug_samples:
                logger.info(f"Debug mode: Reached {debug_samples} samples, stopping load")
                break

        except Exception as e:
            logger.error(f"Error loading {file_path}: {str(e)}")

    total_loaded = len(all_dialogues)
    if debug_samples:
        logger.info(f"Debug mode: Loaded {total_loaded}/{debug_samples} requested dialogues")
    else:
        logger.info(f"Total dialogues loaded: {total_loaded}")

    return all_dialogues

def plot_training_history(history: Dict[str, List[float]], save_dir: Path = None):
    """Plot and optionally save training history."""
    # Create figure with two subplots
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 12))

    # Plot losses
    ax1.plot(history['train_loss'], label='Train Loss')
    ax1.plot(history['val_loss'], label='Validation Loss')
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Triplet Loss')
    ax1.set_title('Training and Validation Loss')
    ax1.legend()
    ax1.grid(True)

    # Plot learning rate if available
    if 'learning_rate' in history:
        ax2.plot(history['learning_rate'], label='Learning Rate')
        ax2.set_xlabel('Step')
        ax2.set_ylabel('Learning Rate')
        ax2.set_title('Learning Rate Schedule')
        ax2.legend()
        ax2.grid(True)

    plt.tight_layout()

    # Save if directory provided
    if save_dir:
        save_dir = Path(save_dir)
        save_dir.mkdir(parents=True, exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        plt.savefig(save_dir / f'training_history_{timestamp}.png')

    plt.show()

def setup_training_directories(base_dir: str = "chatbot_training") -> Dict[str, Path]:
    """Setup directory structure for training artifacts."""
    base_dir = Path(base_dir)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    train_dir = base_dir / f"training_run_{timestamp}"

    directories = {
        'base': train_dir,
        'checkpoints': train_dir / 'checkpoints',
        'plots': train_dir / 'plots',
        'logs': train_dir / 'logs'
    }

    # Create directories
    for dir_path in directories.values():
        dir_path.mkdir(parents=True, exist_ok=True)

    return directories

def run_automatic_validation(
    chatbot,
    response_pool: List[str],
    quality_checker: ResponseQualityChecker,
    num_examples: int = 5
) -> Dict[str, Any]:
    """
    Run automatic validation with quality metrics.
    """
    logger.info("\n=== Running Automatic Validation ===")

    test_queries = [
        "Hello, how are you today?",
        "What's the weather like?",
        "Can you help me with a problem?",
        "Tell me a joke",
        "What time is it?",
        "I need help with my homework",
        "Where's a good place to eat?",
        "What movies are playing?",
        "How do I reset my password?",
        "Can you recommend a book?"
    ]

    test_queries = test_queries[:num_examples]
    metrics_history = []

    for i, query in enumerate(test_queries, 1):
        logger.info(f"\nTest Case {i}:")
        logger.info(f"Query: {query}")

        # Get responses and scores
        responses = chatbot.retrieve_responses(
            query,
            response_pool,
            context=None,
            top_k=5
        )

        # Check quality
        quality_metrics = quality_checker.check_response_quality(
            query, responses, response_pool
        )
        metrics_history.append(quality_metrics)

        # Log results
        logger.info(f"Quality Metrics: {quality_metrics}")
        logger.info("Top responses:")
        for j, (response, score) in enumerate(responses[:3], 1):
            logger.info(f"{j}. Score: {score:.4f}")
            logger.info(f" Response: {response}")
            if j == 1 and not quality_metrics['is_confident']:
                logger.info(" [Low Confidence - Would abstain from answering]")

    # Calculate aggregate metrics
    aggregate_metrics = {
        'num_queries_tested': len(test_queries),
        'avg_top_response_score': np.mean([m['top_score'] for m in metrics_history]),
        'avg_diversity': np.mean([m['response_diversity'] for m in metrics_history]),
        'avg_relevance': np.mean([m['query_response_relevance'] for m in metrics_history]),
        'confidence_rate': np.mean([m['is_confident'] for m in metrics_history]),
    }

    logger.info("\n=== Validation Summary ===")
    for metric, value in aggregate_metrics.items():
        logger.info(f"{metric}: {value:.4f}")

    return aggregate_metrics

def chat_with_quality_check(
    chatbot,
    query: str,
    response_pool: List[str],
    conversation_history: List[Tuple[str, str]],
    quality_checker: ResponseQualityChecker
) -> Tuple[Optional[str], List[Tuple[str, float]], Dict[str, Any]]:
    """
    Enhanced chat function with quality checking.
    """
    # Get responses and scores
    responses = chatbot.retrieve_responses(
        query,
        response_pool,
        conversation_history
    )

    # Check quality
    quality_metrics = quality_checker.check_response_quality(
        query, responses, response_pool
    )

    if quality_metrics['is_confident']:
        return responses[0][0], responses, quality_metrics
    else:
        uncertainty_response = (
            "I apologize, but I don't feel confident providing an answer to that "
            "question at the moment. Could you please rephrase or ask something else?"
        )
        return uncertainty_response, responses, quality_metrics

def get_total_steps(dialogues: List[Dict[str, Any]], batch_size: int, epochs: int) -> int:
    """
    Calculate total training steps based on dialogues and batch size.
    Assume 80% of data for training due to validation split
    """
    estimated_train_samples = int(len(dialogues) * 0.8)
    steps_per_epoch = estimated_train_samples // batch_size
    return steps_per_epoch * epochs

def main():
    # Set up GPU
    is_gpu = setup_gpu()

    DEBUG_SAMPLES = 350
    BATCH_SIZE = 64 if is_gpu else 32
    EPOCHS = 5 if DEBUG_SAMPLES else 10

    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    logger.info(f"Using device: {device}")

    # Set up caching
    cache_dir = setup_model_cache()

    # Set up training directories
    dirs = setup_training_directories()

    # Load training data
    dialogues = load_training_data('processed_outputs', debug_samples=DEBUG_SAMPLES)
    total_steps = get_total_steps(dialogues, BATCH_SIZE, EPOCHS)

    # Initialize configuration
    config = ChatbotConfig(
        embedding_dim=768,  # Match DistilBERT's dimension
        encoder_units=256,
        num_attention_heads=8,
        warmup_steps=int(total_steps * 0.1),
        learning_rate=0.0003,
        margin=0.5,
        pretrained_model='distilbert-base-uncased'
    )

    # Preload models
    preload_models(config, cache_dir)

    # Save configuration
    with open(dirs['base'] / 'config.json', 'w') as f:
        json.dump(config.to_dict(), f, indent=2)

    # Initialize chatbot
    chatbot = RetrievalChatbot(config)

    # Prepare dataset
    logger.info("Preparing dataset...")

    # Prepare and train with debug samples
    q_pad, p_pad, n_pad = chatbot.prepare_dataset(
        dialogues,
        neg_samples_per_pos=3,
        debug_samples=DEBUG_SAMPLES
    )

    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir='./logs')

    # Train model
    logger.info("Starting training...")
    chatbot.train(
        q_pad, p_pad, n_pad,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        validation_split=0.2,
        checkpoint_dir=dirs['checkpoints'],
        callbacks=[tensorboard_callback]
    )

    # Plot and save training history
    plot_training_history(chatbot.history, save_dir=dirs['plots'])

    # Save final model
    chatbot.save_models(dirs['base'] / 'final_model')

    # Prepare response pool for chat
    response_pool = [
        turn['text'] for d in dialogues
        for turn in d['turns'] if turn['speaker'] == 'assistant'
    ]

    # Initialize quality checker with appropriate thresholds
    quality_checker = ResponseQualityChecker(
        confidence_threshold=0.6 if not DEBUG_SAMPLES else 0.4,  # Lower threshold for debug
        diversity_threshold=0.2,
        min_response_length=10,
        max_similarity_ratio=0.9
    )

    # Run automatic validation
    validation_metrics = run_automatic_validation(
        chatbot,
        response_pool,
        quality_checker,
        num_examples=5 if DEBUG_SAMPLES else 10
    )

    # Log validation metrics
    logger.info(f"Validation Metrics: {validation_metrics}")

    # Now continue with interactive chat
    logger.info("\nStarting interactive chat session...")
    conversation_history = []

    while True:
        query = input("\nYou: ")
        if query.lower() in ['quit', 'exit', 'bye']:
            break

        try:
            response, candidates, quality_metrics = chat_with_quality_check(
                chatbot,
                query,
                response_pool,
                conversation_history,
                quality_checker
            )
            print(f"\nAssistant: {response}")

            # Print top alternative responses if confident
            if quality_metrics['is_confident']:
                print("\nAlternative responses:")
                for resp, score in candidates[1:4]:
                    print(f"Score: {score:.4f} - {resp}")

                # Update history only for confident responses
                conversation_history.append((query, response))
            else:
                print("\nQuality metrics indicated low confidence:")
                print(f"Confidence score: {quality_metrics['top_score']:.4f}")
                print(f"Response relevance: {quality_metrics['query_response_relevance']:.4f}")

        except Exception as e:
            logger.error(f"Error during chat: {str(e)}")
            print("Sorry, I encountered an error. Please try again.")

if __name__ == "__main__":
    main()
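run_model3.py above reserves roughly 10% of the estimated training steps for learning-rate warmup (warmup_steps=int(total_steps * 0.1)). The schedule itself is implemented inside the chatbot modules and is not part of this diff; the sketch below is only one common way to express linear warmup followed by inverse-square-root decay as a Keras schedule, with made-up hyperparameters.

import tensorflow as tf

class WarmupThenDecay(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Linear warmup to peak_lr, then inverse-sqrt decay (illustrative only)."""
    def __init__(self, peak_lr: float, warmup_steps: int):
        self.peak_lr = peak_lr
        self.warmup_steps = float(max(1, warmup_steps))

    def __call__(self, step):
        step = tf.cast(step, tf.float32)
        # Ramp linearly up to peak_lr, then decay as 1/sqrt(step / warmup_steps).
        warmup = self.peak_lr * step / self.warmup_steps
        decay = self.peak_lr * tf.math.rsqrt(tf.maximum(step, self.warmup_steps) / self.warmup_steps)
        return tf.where(step < self.warmup_steps, warmup, decay)

# Hypothetical values; the repo's peak LR and warmup length come from its own config.
optimizer = tf.keras.optimizers.Adam(learning_rate=WarmupThenDecay(peak_lr=3e-4, warmup_steps=100))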
run_model4.py
ADDED
@@ -0,0 +1,237 @@
from chatbot4 import RetrievalChatbot, ChatbotConfig
import os
import tensorflow as tf
import matplotlib.pyplot as plt
import logging
from pathlib import Path
from typing import List, Dict, Optional
from datetime import datetime
from response_quality_checker import ResponseQualityChecker

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

def setup_model_cache(cache_dir: Optional[Path] = None) -> Path:
    """Setup and manage model cache directory."""
    if cache_dir is None:
        cache_dir = Path.home() / '.chatbot_cache'

    cache_dir.mkdir(parents=True, exist_ok=True)

    # Set environment variables for various libraries
    os.environ['TRANSFORMERS_CACHE'] = str(cache_dir / 'transformers')
    os.environ['TORCH_HOME'] = str(cache_dir / 'torch')
    os.environ['HF_HOME'] = str(cache_dir / 'huggingface')

    logger.info(f"Using cache directory: {cache_dir}")
    return cache_dir

def setup_gpu():
    """Configure GPU settings for optimal performance."""
    logger.info("Checking GPU availability...")

    gpus = tf.config.list_physical_devices('GPU')
    if gpus:
        try:
            # Allow memory growth to prevent taking all GPU memory at once
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            logger.info(f"Found {len(gpus)} GPU(s). Memory growth enabled.")

            # Log GPU info
            for gpu in gpus:
                logger.info(f"GPU Device: {gpu}")
            return True
        except Exception as e:
            logger.error(f"Error configuring GPU: {str(e)}")
            return False

    else:
        logger.info("No GPU found. Using CPU.")
        return False

# def preload_models(config: ChatbotConfig, cache_dir: Path):
#     """Preload and cache models."""
#     logger.info("Preloading models...")

#     # Cache DistilBERT
#     model_name = config.pretrained_model
#     cache_path = cache_dir / 'transformers' / model_name

#     if not cache_path.exists():
#         logger.info(f"Downloading and caching {model_name}...")
#         tokenizer = AutoTokenizer.from_pretrained(model_name)
#         model = TFAutoModel.from_pretrained(model_name)

#         # Save to cache
#         tokenizer.save_pretrained(cache_path)
#         model.save_pretrained(cache_path)
#     else:
#         logger.info(f"Using cached model from {cache_path}")

#     return cache_path

def plot_training_history(history: Dict[str, List[float]], save_dir: Path = None):
    """Plot and optionally save training history."""
    # Create figure with two subplots
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 12))

    # Plot losses
    ax1.plot(history['train_loss'], label='Train Loss')
    ax1.plot(history['val_loss'], label='Validation Loss')
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Triplet Loss')
    ax1.set_title('Training and Validation Loss')
    ax1.legend()
    ax1.grid(True)

    # Plot learning rate if available
    if 'learning_rate' in history:
        ax2.plot(history['learning_rate'], label='Learning Rate')
        ax2.set_xlabel('Step')
        ax2.set_ylabel('Learning Rate')
        ax2.set_title('Learning Rate Schedule')
        ax2.legend()
        ax2.grid(True)

    plt.tight_layout()

    # Save if directory provided
    if save_dir:
        save_dir = Path(save_dir)
        save_dir.mkdir(parents=True, exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        plt.savefig(save_dir / f'training_history_{timestamp}.png')

    plt.show()

def setup_training_directories(base_dir: str = "chatbot_training") -> Dict[str, Path]:
    """Setup directory structure for training artifacts."""
    base_dir = Path(base_dir)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    train_dir = base_dir / f"training_run_{timestamp}"

    directories = {
        'base': train_dir,
        'checkpoints': train_dir / 'checkpoints',
        'plots': train_dir / 'plots',
        'logs': train_dir / 'logs'
    }

    # Create directories
    for dir_path in directories.values():
        dir_path.mkdir(parents=True, exist_ok=True)

    return directories

def main():
    # Set up GPU
    is_gpu = setup_gpu()

    DEBUG_SAMPLES = 2000
    BATCH_SIZE = 128 if is_gpu else 64
    EPOCHS = 5 if DEBUG_SAMPLES else 10

    # Set up caching
    cache_dir = setup_model_cache()

    # Set up training directories
    dirs = setup_training_directories()

    # Initialize configuration
    config = ChatbotConfig(
        embedding_dim=768,  # Match DistilBERT's dimension
        max_sequence_length=512,
        freeze_embeddings=False
    )

    # Preload models
    #preload_models(config, cache_dir)

    # Save configuration
    # with open(dirs['base'] / 'config.json', 'w') as f:
    #     json.dump(config.to_dict(), f, indent=4)

    # Load training data
    dialogues = RetrievalChatbot.load_training_data(data_path='processed_outputs/batch_group_0010.json', debug_samples=DEBUG_SAMPLES)

    # Initialize chatbot
    chatbot = RetrievalChatbot(config, dialogues)

    # Check trainable variables
    chatbot.check_trainable_variables()

    # Verify FAISS
    chatbot.verify_faiss_index()

    # Prepare dataset
    logger.info("Preparing dataset...")

    # Prepare and train with debug samples
    q_tensor, p_tensor = chatbot.prepare_dataset(dialogues)

    quality_checker = ResponseQualityChecker(chatbot=chatbot)

    # Train model
    logger.info("Starting training...")

    tf.config.optimizer.set_jit(True)  # XLA
    policy = tf.keras.mixed_precision.Policy('mixed_float16')
    tf.keras.mixed_precision.set_global_policy(policy)

    chatbot.train(
        q_pad=q_tensor,
        p_pad=p_tensor,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        validation_split=0.2,
        checkpoint_dir="checkpoints/",
        use_lr_schedule=True,  # Enable custom schedule
        peak_lr=2e-5,  # Peak learning rate
        warmup_steps_ratio=0.1,  # 10% warmup
        early_stopping_patience=3  # Stop if no improvement for 3 epochs
    )

    # Plot and save training history
    #plot_training_history(chatbot.history, save_dir=dirs['plots'])

    # Save final model
    chatbot.save_models(dirs['base'] / 'final_model')

    # Run automatic validation
    validation_metrics = chatbot.run_automatic_validation(quality_checker, num_examples=5)

    # Log validation metrics
    logger.info(f"Validation Metrics: {validation_metrics}")

    # Now continue with interactive chat
    logger.info("\nStarting interactive chat session...")
    conversation_history = []

    while True:
        user_input = input("You: ")
        if user_input.lower() in ['quit', 'exit', 'bye']:
            print("Assistant: Goodbye!")
            break

        response, candidates, metrics = chatbot.chat(
            query=user_input,
            conversation_history=None,  # Pass conversation history if available
            quality_checker=quality_checker,
            top_k=5
        )

        print(f"Assistant: {response}")

        # Optionally, display alternative responses
        if metrics.get('is_confident', False):
            print("\nAlternative responses:")
            for resp, score in candidates[1:4]:
                print(f"Score: {score:.4f} - {resp}")

if __name__ == "__main__":
    main()
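run_model4.py above is the first runner that leans on a FAISS index (chatbot.verify_faiss_index()) rather than scoring a raw response pool per query. The index is built inside chatbot4.py and is not shown in this diff; the following is only a generic sketch of the normalized inner-product (cosine) retrieval pattern FAISS is typically used for here, with random vectors standing in for encoder embeddings.

import numpy as np
import faiss  # faiss-cpu or faiss-gpu, installed by setup_faiss() in setup.py below

dim = 768  # DistilBERT-sized embeddings (illustrative)
embeddings = np.random.rand(1000, dim).astype('float32')
faiss.normalize_L2(embeddings)   # with unit vectors, inner product == cosine similarity

index = faiss.IndexFlatIP(dim)   # exact inner-product index
index.add(embeddings)

query = np.random.rand(1, dim).astype('float32')
faiss.normalize_L2(query)
scores, ids = index.search(query, 5)   # top-5 candidate responses
print(ids[0], scores[0])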
setup.py
CHANGED
@@ -1,6 +1,7 @@
from setuptools import setup, find_packages
import subprocess
import sys
+import platform

with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()
@@ -8,11 +9,21 @@ with open("README.md", "r", encoding="utf-8") as fh:
with open("requirements.txt", "r", encoding="utf-8") as fh:
    requirements = [line.strip() for line in fh if line.strip() and not line.startswith("#")]

-def
+def setup_spacy_models(models=['en_core_web_sm', 'en_core_web_md']):
    """
-    Download spaCy model.
+    Download the specified spaCy model.
+
+    Args:
+        models(List): List[str] of the names of the spaCy model to download.
    """
-
+    try:
+        for model in models:
+            print(f"Downloading spaCy model: {model}")
+            subprocess.check_call([sys.executable, "-m", "spacy", "download", model])
+            print(f"Successfully downloaded spaCy model: {model}")
+    except subprocess.CalledProcessError as e:
+        print(f"Error downloading spaCy model: {model}")
+        print(e)

def setup_models():
    """
@@ -22,10 +33,17 @@ def setup_models():
    from sklearn.feature_extraction.text import TfidfVectorizer
    from transformers import (
        AutoTokenizer,
+        AutoModel,
        GPT2TokenizerFast,
-        MarianTokenizer
+        MarianTokenizer,
+        DistilBertTokenizer,
+        DistilBertModel
    )

+    # Download DistilBERT for chatbot
+    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+    model = DistilBertModel.from_pretrained('distilbert-base-uncased')
+
    # Download Universal Sentence Encoder
    _ = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')

@@ -63,12 +81,48 @@ def setup_nltk():
        except Exception as e:
            print(f"Warning: Could not download {package}: {str(e)}")

+def setup_faiss():
+    """
+    Download required faiss library.
+    """
+    current_os = platform.system()
+    cuda_available = False
+
+    # Function to check CUDA availability
+    def check_cuda():
+        try:
+            import torch
+            return torch.cuda.is_available()
+        except:
+            return False
+
+    if current_os == "Linux" and check_cuda():
+        # Attempt to install faiss-gpu
+        try:
+            print("Attempting to install faiss-gpu...")
+            subprocess.check_call([sys.executable, "-m", "pip", "install", "faiss-gpu>=1.7.0"])
+            print("Successfully installed faiss-gpu")
+            return
+        except subprocess.CalledProcessError:
+            print("Failed to install faiss-gpu. Falling back to faiss-cpu.")
+
+    # Install faiss-cpu as the default
+    try:
+        print("Installing faiss-cpu...")
+        subprocess.check_call([sys.executable, "-m", "pip", "install", "faiss-cpu>=1.7.0"])
+        print("Successfully installed faiss-cpu")
+    except subprocess.CalledProcessError as e:
+        print("Error installing faiss-cpu")
+        print(e)
+
setup(
    name="text-data-augmenter",
    version="0.1.0",
    author="Joe Armani",
    author_email="[email protected]",
    description="A tool for generating high-quality dialogue variations",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
    packages=find_packages(),
    classifiers=[
        "Development Status :: 3 - Alpha",
@@ -95,6 +149,7 @@ setup(
)

if __name__ == '__main__':
-
+    setup_spacy_models()
    setup_models()
-    setup_nltk()
+    setup_nltk()
+    setup_faiss()
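setup_faiss() prefers faiss-gpu on Linux hosts with CUDA available and falls back to faiss-cpu everywhere else. A quick post-install sanity check (an assumption on my part, not something the repo ships) could look like:

import faiss

print(faiss.__version__)
# get_num_gpus() is only meaningful in GPU builds; guard so the check also runs on faiss-cpu.
print(faiss.get_num_gpus() if hasattr(faiss, "get_num_gpus") else "CPU build")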
test_trained_model.py
ADDED
File without changes