JoeArmani committed
Commit · 2183656
Parent(s): 775baf9

upgrade to tf-dataset

Browse files:
- chatbot_model.py +648 -726
- deduplicate_augmented_dialogues.py +74 -0
- run_model_train.py +3 -3
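This commit replaces the threading/Queue-based StreamingDataPipeline with a TFDataPipeline that feeds training through tf.data. The new training loop consumes `dataset_preparer.get_tf_dataset(dialogues, batch_size)`, but that method's body is not visible in this diff view; the sketch below is only an assumed wiring of how such a method could wrap `data_generator` (which, per the diff, yields `(query, positive, hard_negatives)` strings) — it is not the committed implementation.

import tensorflow as tf

def get_tf_dataset(self, dialogues, batch_size):
    # Assumed wiring, not the committed code: wrap the Python generator in tf.data.
    dataset = tf.data.Dataset.from_generator(
        lambda: self.data_generator(dialogues),
        output_signature=(
            tf.TensorSpec(shape=(), dtype=tf.string),                  # query
            tf.TensorSpec(shape=(), dtype=tf.string),                  # positive
            tf.TensorSpec(shape=(self.neg_samples,), dtype=tf.string), # hard negatives (padded to neg_samples)
        ),
    )
    dataset = dataset.batch(batch_size)
    # _prepare_batch runs the HuggingFace tokenizer (plain Python), so it is wrapped in tf.py_function.
    # The three int32 outputs (query, positive, negative token ids) are an assumption about its return shape.
    dataset = dataset.map(
        lambda q, p, n: tf.py_function(self._prepare_batch, [q, p, n], [tf.int32, tf.int32, tf.int32]),
        num_parallel_calls=tf.data.AUTOTUNE,
    )
    return dataset.prefetch(tf.data.AUTOTUNE)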
chatbot_model.py
CHANGED
@@ -2,8 +2,6 @@ import time
 from transformers import TFAutoModel, AutoTokenizer
 import tensorflow as tf
 import numpy as np
-import threading
-from queue import Queue, Empty
 from typing import Generator, List, Tuple, Dict, Optional, Union, Any
 import math
 from dataclasses import dataclass
@@ -12,7 +10,6 @@ from pathlib import Path
 import datetime
 import faiss
 import gc
-import random
 from response_quality_checker import ResponseQualityChecker
 from cross_encoder_reranker import CrossEncoderReranker
 from conversation_summarizer import DeviceAwareModel, Summarizer
@@ -29,7 +26,7 @@ class ChatbotConfig:
     """Configuration for the RetrievalChatbot."""
     vocab_size: int = 30526  # DistilBERT vocab size + special tokens
     max_context_token_limit: int = 512
-    embedding_dim: int =
     encoder_units: int = 256
     num_attention_heads: int = 8
     dropout_rate: float = 0.2
@@ -42,6 +39,7 @@ class ChatbotConfig:
     pretrained_model: str = 'distilbert-base-uncased'
     dtype: str = 'float32'
     freeze_embeddings: bool = False
     # Additional configurations can be added here
 
     def to_dict(self) -> dict:
@@ -103,9 +101,9 @@ class EncoderModel(tf.keras.Model):
 
         # Apply pooling, projection, dropout, and normalization
         x = self.pooler(x)        # Shape: [batch_size, 768]
-        x = self.projection(x)    # Shape: [batch_size,
         x = self.dropout(x, training=training)  # Apply dropout
-        x = self.normalize(x)     # Shape: [batch_size,
 
         return x
 
@@ -139,17 +137,6 @@ class RetrievalChatbot(DeviceAwareModel):
         summarizer = Summarizer(device=self.device)
         self.summarizer = summarizer
 
-        # # Configure XLA optimization if on GPU/TPU
-        # if self.device in ["GPU", "TPU"]:
-        #     tf.config.optimizer.set_jit(True)
-        #     logger.info(f"XLA compilation enabled for {self.device}")
-
-        # # Configure mixed precision for GPU/TPU
-        # if self.device != "CPU":
-        #     policy = tf.keras.mixed_precision.Policy('mixed_float16')
-        #     tf.keras.mixed_precision.set_global_policy(policy)
-        #     logger.info("Mixed precision training enabled (float16)")
-
         # Special tokens
         self.special_tokens = {
             "user": "<USER>",
@@ -354,6 +341,9 @@ class RetrievalChatbot(DeviceAwareModel):
         """
         all_embeddings = []
         self.current_batch_size = batch_size
 
         # Memory stats
         # if self.memory_monitor.has_gpu:
@@ -541,9 +531,9 @@ class RetrievalChatbot(DeviceAwareModel):
         logger.info("Starting vector addition process...")
 
         # Even smaller batches
-        initial_batch_size =
-        min_batch_size =
-        max_batch_size =
 
         total_added = 0
         retry_count = 0
@@ -572,7 +562,6 @@ class RetrievalChatbot(DeviceAwareModel):
                 # Update progress
                 batch_size = len(batch)
                 total_added += batch_size
-                #logger.info(f"Added batch of {batch_size} vectors ({total_added}/{len(response_embeddings)} total)")
 
                 # Memory cleanup every few batches
                 if total_added % (initial_batch_size * 5) == 0:
@@ -618,7 +607,7 @@ class RetrievalChatbot(DeviceAwareModel):
             cpu_index = self.index
 
             # Add remaining vectors on CPU with very small batches
-            batch_size =
             total_added = already_added
 
             for i in range(0, len(remaining_embeddings), batch_size):
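Aside, not part of the diff: the body of this loop is not visible in the hunk above, so the following is only a minimal sketch of a typical batched CPU addition to a FAISS index, reusing the names that do appear in the visible lines (`remaining_embeddings`, `cpu_index`, `already_added`, `batch_size`).

import numpy as np

total_added = already_added
for i in range(0, len(remaining_embeddings), batch_size):
    # FAISS expects contiguous float32 arrays
    batch = np.ascontiguousarray(remaining_embeddings[i:i + batch_size], dtype=np.float32)
    cpu_index.add(batch)
    total_added += len(batch)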
@@ -911,36 +900,33 @@ class RetrievalChatbot(DeviceAwareModel):
         warmup_steps_ratio: float = 0.1,
         early_stopping_patience: int = 3,
         min_delta: float = 1e-4,
-        buffer_size: int = 10,
         neg_samples: int = 1
     ) -> None:
-        """
-        giving priority to training until we meet `steps_per_epoch`, then
-        sending leftover batches to validation.
-        """
-        logger.info("Starting streaming training pipeline...")
 
-        # Initialize
-        dataset_preparer =
             tokenizer=self.tokenizer,
             encoder=self.encoder,
-            index=self.index,
             response_pool=self.response_pool,
             max_length=self.config.max_context_token_limit,
-            batch_size=batch_size,
             neg_samples=neg_samples
         )
 
         # Calculate total steps for learning rate schedule
         total_pairs = dataset_preparer.estimate_total_pairs(dialogues)
-        train_size = total_pairs * (1 - validation_split)
         steps_per_epoch = int(math.ceil(train_size / batch_size))
-        val_steps = int(math.ceil(
         total_steps = steps_per_epoch * epochs
 
         logger.info(f"Total pairs: {total_pairs}")
         logger.info(f"Training pairs: {train_size}")
         logger.info(f"Steps per epoch: {steps_per_epoch}")
         logger.info(f"Validation steps: {val_steps}")
         logger.info(f"Total steps: {total_steps}")
@@ -971,276 +957,245 @@ class RetrievalChatbot(DeviceAwareModel):
         val_log_dir = str(log_dir / f"val_{current_time}")
         train_summary_writer = tf.summary.create_file_writer(train_log_dir)
         val_summary_writer = tf.summary.create_file_writer(val_log_dir)
         logger.info(f"TensorBoard logs will be saved in {log_dir}")
 
         # Training loop
         best_val_loss = float("inf")
         epochs_no_improve = 0
 
-        epoch_pbar = range(1, epochs + 1)
-        is_tqdm_epoch = False
-        logger.info("Epoch progress bar disabled - continuing without visual progress")
-
-        for epoch in epoch_pbar:
-            # Shared queues for streaming pipeline
-            train_queue = Queue(maxsize=buffer_size)
-            val_queue = Queue(maxsize=buffer_size)
-            stop_flag = threading.Event()
-
-            def data_pipeline_worker():
-                """Thread function that processes dialogues and sends batches to train or val."""
-                try:
-                    train_batches_needed = steps_per_epoch  # 9 in your logs
-                    val_batches_needed = val_steps          # 3 in your logs
-                    train_batches_sent = 0
-                    val_batches_sent = 0
-
-                            train_queue.put(batch)
-                            train_batches_sent += 1
-                        elif val_batches_sent < val_batches_needed:
-                            val_queue.put(batch)
-                            val_batches_sent += 1
-                        else:
-                            # We have enough batches for both train & val
-                            break
-
-                    # If we still haven't met our target steps, REPEAT the data
-                    if train_batches_sent < train_batches_needed or val_batches_sent < val_batches_needed:
-                        logger.info("Data exhausted, repeating since we still need more batches...")
-                        # Optionally shuffle again
-                        random.shuffle(dataset_preparer.processed_pairs)
-                    else:
-                        # We have enough
-                        break
-
-                        f"{val_batches_sent}/{val_batches_needed} val batches"
-                    )
-
-                    raise e
-                finally:
-                    train_queue.put(None)
-                    val_queue.put(None)
-
         try:
-            try:
-                train_pbar = tqdm(total=steps_per_epoch, desc=f"Training Epoch {epoch}")
-                is_tqdm_train = True
-            except ImportError:
-                train_pbar = None
-                is_tqdm_train = False
-                logger.info("Training progress bar disabled")
-
-            while batches_processed < steps_per_epoch:
-                try:
-                    batch = train_queue.get(timeout=1200)  # 20 minutes timeout
-                    if batch is None:
-                        logger.warning(f"Received end signal after only {batches_processed}/{steps_per_epoch} batches")
-                        break
-
-                    q_batch, p_batch = batch[0], batch[1]
-                    attention_mask = batch[2] if len(batch) > 2 else None
-
-                    loss = self.train_step(q_batch, p_batch, attention_mask)
-                    epoch_loss_avg(loss)
-                    batches_processed += 1
-
-                    # Log to TensorBoard
-                    with train_summary_writer.as_default():
-                        tf.summary.scalar("loss", loss, step=epoch)
-
-                    # Update progress bar
-                    if use_lr_schedule:
-                        current_lr = float(lr_schedule(self.optimizer.iterations))
-                    else:
-                        current_lr = float(self.optimizer.learning_rate.numpy())
-
-                        train_pbar.set_postfix({
-                            "loss": f"{loss.numpy():.4f}",
-                            "lr": f"{current_lr:.2e}",
-                            "batches": f"{batches_processed}/{steps_per_epoch}"
-                        })
-
-                    break
-
-                is_tqdm_val = True
-            except ImportError:
-                val_pbar = None
-                is_tqdm_val = False
-                logger.info("Validation progress bar disabled")
-
-            while val_batches_processed < val_steps:
-                try:
-                    batch = val_queue.get(timeout=30)
-                    if batch is None:
-                        logger.warning(
-                            f"Received end signal after {val_batches_processed}/{val_steps} validation batches"
-                        )
-                        break
-
-                    q_batch, p_batch = batch[0], batch[1]
-                    attention_mask = batch[2] if len(batch) > 2 else None
-
-                    val_loss = self.validation_step(q_batch, p_batch, attention_mask)
-                    val_loss_avg(val_loss)
-                    val_batches_processed += 1
-
-                    if is_tqdm_val:
-                        val_pbar.update(1)
-                        val_pbar.set_postfix({
-                            "val_loss": f"{val_loss.numpy():.4f}",
-                            "batches": f"{val_batches_processed}/{val_steps}"
-                        })
-
-                except Empty:
-                    logger.warning(
-                        f"Validation queue timeout after {val_batches_processed}/{val_steps} batches"
-                    )
-                    break
-
-            if is_tqdm_val and val_pbar:
-                val_pbar.close()
-
-            # End of epoch: compute final epoch stats
-            train_loss = epoch_loss_avg.result().numpy()
-            val_loss = val_loss_avg.result().numpy()
-            logger.info(f"Epoch {epoch} Complete: Train Loss={train_loss:.4f}, Val Loss={val_loss:.4f}")
-
-            # Log epoch metrics
-            with val_summary_writer.as_default():
-                tf.summary.scalar("val_loss", val_loss, step=epoch)
-
-            # Save checkpoint
-            manager.save()
-
-            # Store metrics in history
-            self.history['train_loss'].append(train_loss)
-            self.history['val_loss'].append(val_loss)
-
-                current_lr = float(self.optimizer.learning_rate.numpy())
-
-            if val_loss < best_val_loss - min_delta:
-                best_val_loss = val_loss
-                epochs_no_improve = 0
-                logger.info(f"Validation loss improved to {val_loss:.4f}. Reset patience.")
-            else:
-                epochs_no_improve += 1
-                logger.info(f"No improvement this epoch. Patience: {epochs_no_improve}/{early_stopping_patience}")
-                if epochs_no_improve >= early_stopping_patience:
-                    logger.info("Early stopping triggered.")
-                    break
 
         logger.info("Streaming training completed!")
 
 
     @tf.function
-    def train_step(
         with tf.GradientTape() as tape:
             loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
-                labels=labels,
             )
             if attention_mask is not None:
                 loss = loss * attention_mask
-                # normalize by the sum of attention_mask
                 loss = tf.reduce_sum(loss) / tf.reduce_sum(attention_mask)
-            else:
-                loss = tf.reduce_mean(loss)
 
         gradients = tape.gradient(loss, self.encoder.trainable_variables)
         self.optimizer.apply_gradients(zip(gradients, self.encoder.trainable_variables))
         return loss
 
     @tf.function
-    def validation_step(
         q_enc = self.encoder(q_batch, training=False)
         p_enc = self.encoder(p_batch, training=False)
 
         loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
-            labels=labels,
         )
         if attention_mask is not None:
             loss = loss * attention_mask
             loss = tf.reduce_sum(loss) / tf.reduce_sum(attention_mask)
-        else:
-            loss = tf.reduce_mean(loss)
 
         return loss
@@ -1382,235 +1337,33 @@ class RetrievalChatbot(DeviceAwareModel):
|
|
1382 |
conversation_parts.append(f"{self.special_tokens['user']} {query}")
|
1383 |
return "\n".join(conversation_parts)
|
1384 |
|
1385 |
-
class
|
1386 |
-
"""Helper class to manage the streaming data preparation pipeline with optimized caching and GPU usage."""
|
1387 |
def __init__(
|
1388 |
-
self,
|
1389 |
-
|
1390 |
-
|
1391 |
-
|
1392 |
-
|
1393 |
-
|
1394 |
-
|
1395 |
-
neg_samples: int
|
1396 |
):
|
|
|
1397 |
self.tokenizer = tokenizer
|
1398 |
self.encoder = encoder
|
1399 |
-
self.index = index
|
1400 |
self.response_pool = response_pool
|
1401 |
self.max_length = max_length
|
1402 |
-
self.base_batch_size = batch_size
|
1403 |
self.neg_samples = neg_samples
|
|
|
|
|
|
|
1404 |
self.memory_monitor = GPUMemoryMonitor()
|
1405 |
-
|
1406 |
-
# Caching structures
|
1407 |
-
self.hard_negatives_cache = {}
|
1408 |
-
self.processed_pairs = []
|
1409 |
-
self.query_embeddings_cache = {}
|
1410 |
-
|
1411 |
-
# Error tracking
|
1412 |
-
self.error_count = 0
|
1413 |
self.max_retries = 3
|
1414 |
-
|
1415 |
-
# Batch processing settings
|
1416 |
-
self.current_batch_size = batch_size
|
1417 |
-
self.batch_increase_factor = 1.25
|
1418 |
-
|
1419 |
-
# TODO: use GPU/strategy
|
1420 |
-
if len(response_pool) < 100:
|
1421 |
-
self.embedding_batch_size = 16
|
1422 |
-
self.search_batch_size = 16
|
1423 |
-
self.max_batch_size = 32
|
1424 |
-
self.min_batch_size = 4
|
1425 |
-
else:
|
1426 |
-
self.embedding_batch_size = 64
|
1427 |
-
self.search_batch_size = 64
|
1428 |
-
self.min_batch_size = 8
|
1429 |
-
self.max_batch_size = 64
|
1430 |
-
|
1431 |
-
def save_cache(self, cache_dir: Path) -> None:
|
1432 |
-
"""Save all cached data for future runs."""
|
1433 |
-
cache_dir = Path(cache_dir)
|
1434 |
-
cache_dir.mkdir(parents=True, exist_ok=True)
|
1435 |
-
|
1436 |
-
logger.info(f"Saving cache to {cache_dir}")
|
1437 |
-
|
1438 |
-
# Save embeddings cache
|
1439 |
-
embeddings_path = cache_dir / "query_embeddings.npy"
|
1440 |
-
np.save(
|
1441 |
-
embeddings_path,
|
1442 |
-
{k: v.numpy() if hasattr(v, 'numpy') else v
|
1443 |
-
for k, v in self.query_embeddings_cache.items()}
|
1444 |
-
)
|
1445 |
-
|
1446 |
-
# Save hard negatives and processed pairs
|
1447 |
-
with open(cache_dir / "hard_negatives.json", 'w') as f:
|
1448 |
-
json.dump(self.hard_negatives_cache, f)
|
1449 |
-
|
1450 |
-
with open(cache_dir / "processed_pairs.json", 'w') as f:
|
1451 |
-
json.dump(self.processed_pairs, f)
|
1452 |
-
|
1453 |
-
logger.info("Cache saved successfully")
|
1454 |
-
|
1455 |
-
def load_cache(self, cache_dir: Path) -> bool:
|
1456 |
-
"""Load cached data if available."""
|
1457 |
-
cache_dir = Path(cache_dir)
|
1458 |
-
required_files = [
|
1459 |
-
"query_embeddings.npy",
|
1460 |
-
"hard_negatives.json",
|
1461 |
-
"processed_pairs.json"
|
1462 |
-
]
|
1463 |
-
|
1464 |
-
if not all((cache_dir / f).exists() for f in required_files):
|
1465 |
-
logger.info("Cache files not found")
|
1466 |
-
return False
|
1467 |
-
|
1468 |
-
try:
|
1469 |
-
logger.info("Loading cache...")
|
1470 |
-
|
1471 |
-
# Load embeddings
|
1472 |
-
self.query_embeddings_cache = np.load(
|
1473 |
-
cache_dir / "query_embeddings.npy",
|
1474 |
-
allow_pickle=True
|
1475 |
-
).item()
|
1476 |
-
|
1477 |
-
# Load other caches
|
1478 |
-
with open(cache_dir / "hard_negatives.json", 'r') as f:
|
1479 |
-
self.hard_negatives_cache = json.load(f)
|
1480 |
-
|
1481 |
-
with open(cache_dir / "processed_pairs.json", 'r') as f:
|
1482 |
-
self.processed_pairs = json.load(f)
|
1483 |
-
|
1484 |
-
logger.info(f"Cache loaded successfully: {len(self.processed_pairs)} pairs")
|
1485 |
-
return True
|
1486 |
-
|
1487 |
-
except Exception as e:
|
1488 |
-
logger.error(f"Error loading cache: {e}")
|
1489 |
-
return False
|
1490 |
|
1491 |
-
|
1492 |
-
|
1493 |
-
if self.memory_monitor:
|
1494 |
-
if self.memory_monitor.should_reduce_batch_size():
|
1495 |
-
new_size = max(self.min_batch_size, self.current_batch_size // 2)
|
1496 |
-
if new_size != self.current_batch_size:
|
1497 |
-
if new_size < self.min_batch_size:
|
1498 |
-
logger.info(f"Reducing batch size to {new_size} due to high memory usage")
|
1499 |
-
self.current_batch_size = new_size
|
1500 |
-
gc.collect()
|
1501 |
-
if tf.config.list_physical_devices('GPU'):
|
1502 |
-
tf.keras.backend.clear_session()
|
1503 |
-
|
1504 |
-
elif self.memory_monitor.can_increase_batch_size():
|
1505 |
-
new_size = min(self.max_batch_size, int(self.current_batch_size * self.batch_increase_factor)) # More gradual increase
|
1506 |
-
if new_size != self.current_batch_size:
|
1507 |
-
if new_size > self.max_batch_size:
|
1508 |
-
logger.info(f"Increasing batch size to {new_size}")
|
1509 |
-
self.current_batch_size = new_size
|
1510 |
-
|
1511 |
-
def _add_progress_metrics(self, pbar, **metrics) -> None:
|
1512 |
-
"""Add memory and batch size metrics to progress bars."""
|
1513 |
-
if self.memory_monitor:
|
1514 |
-
gpu_usage = self.memory_monitor.get_memory_usage()
|
1515 |
-
metrics['gpu_mem'] = f"{gpu_usage:.1%}"
|
1516 |
-
metrics['batch_size'] = self.current_batch_size
|
1517 |
-
pbar.set_postfix(**metrics)
|
1518 |
-
|
1519 |
-
def preprocess_dialogues(self, dialogues: List[dict]) -> None:
|
1520 |
-
"""Preprocess all dialogues with error recovery and caching."""
|
1521 |
-
retry_count = 0
|
1522 |
-
|
1523 |
-
while retry_count < self.max_retries:
|
1524 |
-
try:
|
1525 |
-
self._preprocess_dialogues_internal(dialogues)
|
1526 |
-
break
|
1527 |
-
except Exception as e:
|
1528 |
-
retry_count += 1
|
1529 |
-
logger.warning(f"Preprocessing attempt {retry_count} failed: {e}")
|
1530 |
-
if retry_count == self.max_retries:
|
1531 |
-
logger.error("Max retries reached. Falling back to CPU processing")
|
1532 |
-
self._fallback_to_cpu_processing(dialogues)
|
1533 |
-
|
1534 |
-
def _preprocess_dialogues_internal(self, dialogues: List[dict]) -> None:
|
1535 |
-
"""Internal preprocessing implementation with progress tracking."""
|
1536 |
-
logger.info("Starting dialogue preprocessing...")
|
1537 |
-
|
1538 |
-
# Collect unique queries and pairs
|
1539 |
-
unique_queries = set()
|
1540 |
-
query_positive_pairs = []
|
1541 |
-
|
1542 |
-
with tqdm(total=len(dialogues), desc="Collecting dialogue pairs") as pbar:
|
1543 |
-
for dialogue in dialogues:
|
1544 |
-
pairs = self._extract_pairs_from_dialogue(dialogue)
|
1545 |
-
for query, positive in pairs:
|
1546 |
-
unique_queries.add(query)
|
1547 |
-
query_positive_pairs.append((query, positive))
|
1548 |
-
pbar.update(1)
|
1549 |
-
self._add_progress_metrics(pbar, pairs=len(query_positive_pairs))
|
1550 |
-
|
1551 |
-
# Precompute embeddings
|
1552 |
-
logger.info("Precomputing query embeddings...")
|
1553 |
-
self.precompute_query_embeddings(list(unique_queries))
|
1554 |
-
|
1555 |
-
# Find hard negatives
|
1556 |
-
logger.info("Finding hard negatives for all pairs...")
|
1557 |
-
self._find_hard_negatives_for_pairs(query_positive_pairs)
|
1558 |
|
1559 |
-
def precompute_query_embeddings(self, queries: List[str]) -> None:
|
1560 |
-
"""Precompute embeddings for all unique queries in batches."""
|
1561 |
-
unique_queries = list(set(queries))
|
1562 |
-
|
1563 |
-
with tqdm(total=len(unique_queries), desc="Precomputing query embeddings") as pbar:
|
1564 |
-
for i in range(0, len(unique_queries), self.embedding_batch_size):
|
1565 |
-
# Adjust batch size based on memory
|
1566 |
-
self._adjust_batch_size()
|
1567 |
-
batch_size = min(self.embedding_batch_size, len(unique_queries) - i)
|
1568 |
-
|
1569 |
-
# Get batch of queries
|
1570 |
-
batch_queries = unique_queries[i:i + batch_size]
|
1571 |
-
|
1572 |
-
try:
|
1573 |
-
# Tokenize batch
|
1574 |
-
encoded = self.tokenizer(
|
1575 |
-
batch_queries,
|
1576 |
-
padding=True,
|
1577 |
-
truncation=True,
|
1578 |
-
max_length=self.max_length,
|
1579 |
-
return_tensors='tf'
|
1580 |
-
)
|
1581 |
-
|
1582 |
-
# Get embeddings
|
1583 |
-
embeddings = self.encoder(encoded['input_ids'], training=False)
|
1584 |
-
embeddings_np = embeddings.numpy().astype('float32')
|
1585 |
-
|
1586 |
-
# Normalize for similarity search
|
1587 |
-
faiss.normalize_L2(embeddings_np)
|
1588 |
-
|
1589 |
-
# Cache embeddings
|
1590 |
-
for query, emb in zip(batch_queries, embeddings_np):
|
1591 |
-
self.query_embeddings_cache[query] = emb
|
1592 |
-
|
1593 |
-
pbar.update(len(batch_queries))
|
1594 |
-
self._add_progress_metrics(
|
1595 |
-
pbar,
|
1596 |
-
cached=len(self.query_embeddings_cache),
|
1597 |
-
batch_size=batch_size
|
1598 |
-
)
|
1599 |
-
|
1600 |
-
except Exception as e:
|
1601 |
-
logger.warning(f"Error processing batch: {e}")
|
1602 |
-
# Reduce batch size and retry
|
1603 |
-
self.embedding_batch_size = max(self.min_batch_size, self.embedding_batch_size // 2)
|
1604 |
-
continue
|
1605 |
-
|
1606 |
-
# Memory cleanup after successful batch
|
1607 |
-
if i % (self.embedding_batch_size * 10) == 0:
|
1608 |
-
gc.collect()
|
1609 |
-
if tf.config.list_physical_devices('GPU'):
|
1610 |
-
tf.keras.backend.clear_session()
|
1611 |
-
|
1612 |
-
logger.info(f"Cached embeddings for {len(self.query_embeddings_cache)} unique queries")
|
1613 |
-
|
1614 |
def _extract_pairs_from_dialogue(self, dialogue: dict) -> List[Tuple[str, str]]:
|
1615 |
"""Extract query-response pairs from a dialogue."""
|
1616 |
pairs = []
|
@@ -1631,305 +1384,474 @@ class StreamingDataPipeline:
|
|
1631 |
|
1632 |
return pairs
|
1633 |
|
1634 |
-
def
|
1635 |
-
"""
|
1636 |
-
|
1637 |
-
|
1638 |
-
|
1639 |
-
|
1640 |
-
|
1641 |
-
|
1642 |
-
|
1643 |
-
|
1644 |
-
|
1645 |
-
|
1646 |
-
|
1647 |
-
Process dialogues using cached data with dynamic batch sizing.
|
1648 |
-
Yields (q_tokens['input_ids'], p_tokens['input_ids'], attention_mask) tuples.
|
1649 |
-
"""
|
1650 |
-
# Preprocess if not already done
|
1651 |
-
if not self.processed_pairs:
|
1652 |
-
self.preprocess_dialogues(dialogues)
|
1653 |
-
|
1654 |
-
# Generate batches from cached data
|
1655 |
-
current_queries = []
|
1656 |
-
current_positives = []
|
1657 |
-
|
1658 |
-
# Counters for logging
|
1659 |
-
total_examples_yielded = 0
|
1660 |
-
total_batches_yielded = 0
|
1661 |
-
|
1662 |
-
with tqdm(total=len(self.processed_pairs), desc="Generating training batches", leave=False) as pbar:
|
1663 |
-
for i, (query, positive) in enumerate(self.processed_pairs):
|
1664 |
-
# Periodically adjust batch size
|
1665 |
-
if i % 10 == 0: # Check more frequently (e.g., every 10 pairs)
|
1666 |
-
self._adjust_batch_size()
|
1667 |
-
|
1668 |
-
# Add original pair
|
1669 |
-
current_queries.append(query)
|
1670 |
-
current_positives.append(positive)
|
1671 |
-
|
1672 |
-
# Add cached hard negatives for each query
|
1673 |
-
hard_negatives = self.hard_negatives_cache.get((query, positive), [])
|
1674 |
-
for neg_text in hard_negatives:
|
1675 |
-
current_queries.append(query)
|
1676 |
-
current_positives.append(neg_text)
|
1677 |
-
|
1678 |
-
# If we have enough examples to form a full batch, yield it
|
1679 |
-
while len(current_queries) >= self.current_batch_size:
|
1680 |
-
batch_queries = current_queries[:self.current_batch_size]
|
1681 |
-
batch_positives = current_positives[:self.current_batch_size]
|
1682 |
-
|
1683 |
-
# Update counters and logs
|
1684 |
-
batch_size_to_yield = len(batch_queries)
|
1685 |
-
total_examples_yielded += batch_size_to_yield
|
1686 |
-
total_batches_yielded += 1
|
1687 |
-
|
1688 |
-
yield self._prepare_batch(batch_queries, batch_positives, pad_to_batch_size=False)
|
1689 |
-
|
1690 |
-
# Remove used entries
|
1691 |
-
current_queries = current_queries[self.current_batch_size:]
|
1692 |
-
current_positives = current_positives[self.current_batch_size:]
|
1693 |
-
|
1694 |
-
# Update progress bar
|
1695 |
-
pbar.update(1)
|
1696 |
-
self._add_progress_metrics(
|
1697 |
-
pbar,
|
1698 |
-
pairs_processed=pbar.n,
|
1699 |
-
pending_pairs=len(current_queries)
|
1700 |
-
)
|
1701 |
-
|
1702 |
-
# After the loop, if anything is left, yield a final partial batch
|
1703 |
-
if current_queries:
|
1704 |
-
leftover_size = len(current_queries)
|
1705 |
-
total_examples_yielded += leftover_size
|
1706 |
-
total_batches_yielded += 1
|
1707 |
-
|
1708 |
-
yield self._prepare_batch(
|
1709 |
-
current_queries,
|
1710 |
-
current_positives,
|
1711 |
-
pad_to_batch_size=True
|
1712 |
-
)
|
1713 |
-
|
1714 |
-
def _find_hard_negatives_for_pairs(self, query_positive_pairs: List[Tuple[str, str]]) -> None:
|
1715 |
-
"""Process pairs in batches to find hard negatives with GPU acceleration."""
|
1716 |
-
total_pairs = len(query_positive_pairs)
|
1717 |
-
|
1718 |
-
# Use smaller batch size for small datasets
|
1719 |
-
if len(self.response_pool) < 1000:
|
1720 |
-
batch_size = min(8, self.search_batch_size)
|
1721 |
-
else:
|
1722 |
-
batch_size = self.search_batch_size
|
1723 |
-
|
1724 |
-
try:
|
1725 |
-
pbar = tqdm(total=total_pairs, desc="Finding hard negatives")
|
1726 |
-
is_tqdm = True
|
1727 |
-
except ImportError:
|
1728 |
-
pbar = None
|
1729 |
-
is_tqdm = False
|
1730 |
-
logger.info("Progress bar disabled - continuing without visual progress")
|
1731 |
-
|
1732 |
-
for i in range(0, total_pairs, batch_size):
|
1733 |
-
self._adjust_batch_size()
|
1734 |
-
|
1735 |
-
batch_pairs = query_positive_pairs[i:i + batch_size]
|
1736 |
-
batch_queries, batch_positives = zip(*batch_pairs)
|
1737 |
-
|
1738 |
-
batch_negatives = self._find_hard_negatives_batch(
|
1739 |
-
list(batch_queries),
|
1740 |
-
list(batch_positives)
|
1741 |
-
)
|
1742 |
-
|
1743 |
-
for query, positive, negatives in zip(batch_queries, batch_positives, batch_negatives):
|
1744 |
-
self.hard_negatives_cache[(query, positive)] = negatives
|
1745 |
-
self.processed_pairs.append((query, positive))
|
1746 |
-
|
1747 |
-
if is_tqdm:
|
1748 |
-
pbar.update(len(batch_pairs))
|
1749 |
-
self._add_progress_metrics(
|
1750 |
-
pbar,
|
1751 |
-
cached=len(self.processed_pairs),
|
1752 |
-
progress=f"{i+len(batch_pairs)}/{total_pairs}"
|
1753 |
-
)
|
1754 |
-
|
1755 |
-
if is_tqdm:
|
1756 |
-
pbar.close()
|
1757 |
-
|
1758 |
def _find_hard_negatives_batch(self, queries: List[str], positives: List[str]) -> List[List[str]]:
|
1759 |
"""Find hard negatives for a batch of queries with error handling and retries."""
|
1760 |
retry_count = 0
|
1761 |
total_responses = len(self.response_pool)
|
1762 |
-
|
1763 |
-
# For very small datasets (testing), just use random sampling
|
1764 |
-
if total_responses < 100:
|
1765 |
-
all_negatives = []
|
1766 |
-
for positive in positives:
|
1767 |
-
available = [r for r in self.response_pool if r.strip() != positive.strip()]
|
1768 |
-
if available:
|
1769 |
-
negatives = list(np.random.choice(
|
1770 |
-
available,
|
1771 |
-
size=min(self.neg_samples, len(available)),
|
1772 |
-
replace=False
|
1773 |
-
))
|
1774 |
-
else:
|
1775 |
-
negatives = []
|
1776 |
-
# Pad with empty strings if needed
|
1777 |
-
while len(negatives) < self.neg_samples:
|
1778 |
-
negatives.append("")
|
1779 |
-
all_negatives.append(negatives)
|
1780 |
-
return all_negatives
|
1781 |
-
|
1782 |
while retry_count < self.max_retries:
|
1783 |
try:
|
1784 |
-
# Get cached embeddings and ensure they're the right type
|
1785 |
query_embeddings = np.vstack([
|
1786 |
self.query_embeddings_cache[q] for q in queries
|
1787 |
]).astype(np.float32)
|
1788 |
|
1789 |
-
|
1790 |
-
query_embeddings = np.ascontiguousarray(query_embeddings)
|
1791 |
-
|
1792 |
-
# Normalize embeddings
|
1793 |
faiss.normalize_L2(query_embeddings)
|
1794 |
-
|
1795 |
-
k = 1
|
1796 |
#logger.debug(f"Searching with k={k} among {total_responses} responses")
|
1797 |
-
|
1798 |
-
assert query_embeddings.dtype == np.float32, f"Embeddings are not float32: {query_embeddings.dtype}" # Assertion here
|
1799 |
|
1800 |
-
|
1801 |
-
|
1802 |
-
except RuntimeError as e:
|
1803 |
-
logger.error(f"FAISS search failed: {e}")
|
1804 |
-
return self._fallback_random_negatives(queries, positives)
|
1805 |
-
|
1806 |
-
# Process results
|
1807 |
all_negatives = []
|
1808 |
-
for
|
1809 |
negatives = []
|
1810 |
positive_strip = positive.strip()
|
1811 |
-
|
1812 |
-
# Filter valid indices and deduplicate
|
1813 |
seen = {positive_strip}
|
|
|
1814 |
for idx in query_indices:
|
1815 |
if idx >= 0 and idx < total_responses:
|
1816 |
candidate = self.response_pool[idx].strip()
|
1817 |
-
if candidate and candidate not in seen:
|
1818 |
seen.add(candidate)
|
1819 |
negatives.append(candidate)
|
1820 |
if len(negatives) >= self.neg_samples:
|
1821 |
break
|
1822 |
-
|
1823 |
-
#
|
1824 |
-
if len(negatives) < self.neg_samples:
|
1825 |
-
available = [r for r in self.response_pool if r.strip() not in seen and r.strip()]
|
1826 |
-
if available:
|
1827 |
-
additional = np.random.choice(
|
1828 |
-
available,
|
1829 |
-
size=min(self.neg_samples - len(negatives), len(available)),
|
1830 |
-
replace=False
|
1831 |
-
)
|
1832 |
-
negatives.extend(additional)
|
1833 |
-
|
1834 |
-
# Still pad with empty strings if needed
|
1835 |
while len(negatives) < self.neg_samples:
|
1836 |
-
negatives.append("")
|
1837 |
-
|
1838 |
all_negatives.append(negatives)
|
1839 |
-
|
1840 |
return all_negatives
|
1841 |
-
|
1842 |
except Exception as e:
|
1843 |
retry_count += 1
|
1844 |
logger.warning(f"Hard negative search attempt {retry_count} failed: {e}")
|
1845 |
if retry_count == self.max_retries:
|
1846 |
logger.error("Max retries reached for hard negative search")
|
1847 |
-
return [[] for _ in queries] # Return empty
|
1848 |
gc.collect()
|
1849 |
if tf.config.list_physical_devices('GPU'):
|
1850 |
tf.keras.backend.clear_session()
|
1851 |
-
|
1852 |
-
def _fallback_random_negatives(self, queries: List[str], positives: List[str]) -> List[List[str]]:
|
1853 |
-
"""Fallback to random sampling when similarity search fails."""
|
1854 |
-
logger.warning("Falling back to random negative sampling")
|
1855 |
-
all_negatives = []
|
1856 |
-
for positive in positives:
|
1857 |
-
available = [r for r in self.response_pool if r.strip() != positive.strip()]
|
1858 |
-
negatives = list(np.random.choice(
|
1859 |
-
available,
|
1860 |
-
size=min(self.neg_samples, len(available)),
|
1861 |
-
replace=False
|
1862 |
-
)) if available else []
|
1863 |
-
while len(negatives) < self.neg_samples:
|
1864 |
-
negatives.append("")
|
1865 |
-
all_negatives.append(negatives)
|
1866 |
-
return all_negatives
|
1867 |
-
|
1868 |
-
def _prepare_batch(
|
1869 |
-
self,
|
1870 |
-
queries: List[str],
|
1871 |
-
positives: List[str],
|
1872 |
-
pad_to_batch_size: bool = False
|
1873 |
-
) -> Tuple[tf.Tensor, tf.Tensor, Optional[tf.Tensor]]:
|
1874 |
-
"""Prepare a batch with dynamic padding and memory optimization."""
|
1875 |
-
actual_size = len(queries)
|
1876 |
-
|
1877 |
-
# Handle padding if requested and needed
|
1878 |
-
if pad_to_batch_size and actual_size < self.current_batch_size:
|
1879 |
-
padding_needed = self.current_batch_size - actual_size
|
1880 |
-
queries.extend([queries[0]] * padding_needed)
|
1881 |
-
positives.extend([positives[0]] * padding_needed)
|
1882 |
-
# Create attention mask for padded examples
|
1883 |
-
attention_mask = tf.concat([
|
1884 |
-
tf.ones((actual_size,), dtype=tf.float32),
|
1885 |
-
tf.zeros((padding_needed,), dtype=tf.float32)
|
1886 |
-
], axis=0)
|
1887 |
-
else:
|
1888 |
-
attention_mask = None
|
1889 |
|
1890 |
-
|
1891 |
-
|
1892 |
-
|
1893 |
-
|
|
|
|
|
|
|
1894 |
padding='max_length',
|
1895 |
truncation=True,
|
1896 |
max_length=self.max_length,
|
1897 |
return_tensors='tf'
|
1898 |
)
|
1899 |
-
|
1900 |
-
|
1901 |
-
|
|
|
|
|
|
|
|
1902 |
truncation=True,
|
1903 |
max_length=self.max_length,
|
1904 |
return_tensors='tf'
|
1905 |
)
|
1906 |
|
1907 |
-
|
1908 |
-
|
1909 |
-
|
1910 |
-
logger.error(f"Error preparing batch: {e}")
|
1911 |
-
# Emergency memory cleanup
|
1912 |
-
gc.collect()
|
1913 |
-
if tf.config.list_physical_devices('GPU'):
|
1914 |
-
tf.keras.backend.clear_session()
|
1915 |
-
raise
|
1916 |
|
1917 |
-
|
1918 |
-
|
1919 |
-
|
1920 |
-
|
1921 |
-
|
1922 |
-
|
1923 |
-
|
1924 |
-
|
1925 |
-
|
|
|
|
|
|
1926 |
)
|
1927 |
-
|
1928 |
-
|
|
|
|
|
|
|
1929 |
|
1930 |
-
|
1931 |
-
|
1932 |
-
self.query_embeddings_cache.clear()
|
1933 |
-
gc.collect()
|
1934 |
-
if tf.config.list_physical_devices('GPU'):
|
1935 |
-
tf.keras.backend.clear_session()
|
|
|
2 |
from transformers import TFAutoModel, AutoTokenizer
|
3 |
import tensorflow as tf
|
4 |
import numpy as np
|
|
|
|
|
5 |
from typing import Generator, List, Tuple, Dict, Optional, Union, Any
|
6 |
import math
|
7 |
from dataclasses import dataclass
|
|
|
10 |
import datetime
|
11 |
import faiss
|
12 |
import gc
|
|
|
13 |
from response_quality_checker import ResponseQualityChecker
|
14 |
from cross_encoder_reranker import CrossEncoderReranker
|
15 |
from conversation_summarizer import DeviceAwareModel, Summarizer
|
|
|
26 |
"""Configuration for the RetrievalChatbot."""
|
27 |
vocab_size: int = 30526 # DistilBERT vocab size + special tokens
|
28 |
max_context_token_limit: int = 512
|
29 |
+
embedding_dim: int = 768
|
30 |
encoder_units: int = 256
|
31 |
num_attention_heads: int = 8
|
32 |
dropout_rate: float = 0.2
|
|
|
39 |
pretrained_model: str = 'distilbert-base-uncased'
|
40 |
dtype: str = 'float32'
|
41 |
freeze_embeddings: bool = False
|
42 |
+
embedding_batch_size: int = 128
|
43 |
# Additional configurations can be added here
|
44 |
|
45 |
def to_dict(self) -> dict:
|
|
|
101 |
|
102 |
# Apply pooling, projection, dropout, and normalization
|
103 |
x = self.pooler(x) # Shape: [batch_size, 768]
|
104 |
+
x = self.projection(x) # Shape: [batch_size, 768]
|
105 |
x = self.dropout(x, training=training) # Apply dropout
|
106 |
+
x = self.normalize(x) # Shape: [batch_size, 768]
|
107 |
|
108 |
return x
|
109 |
|
|
|
137 |
summarizer = Summarizer(device=self.device)
|
138 |
self.summarizer = summarizer
|
139 |
|
|
|
|
|
|
140 |
# Special tokens
|
141 |
self.special_tokens = {
|
142 |
"user": "<USER>",
|
|
|
341 |
"""
|
342 |
all_embeddings = []
|
343 |
self.current_batch_size = batch_size
|
344 |
+
|
345 |
+
if self.memory_monitor.has_gpu:
|
346 |
+
batch_size = 128
|
347 |
|
348 |
# Memory stats
|
349 |
# if self.memory_monitor.has_gpu:
|
|
|
531 |
logger.info("Starting vector addition process...")
|
532 |
|
533 |
# Even smaller batches
|
534 |
+
initial_batch_size = 128
|
535 |
+
min_batch_size = 32
|
536 |
+
max_batch_size = 1024
|
537 |
|
538 |
total_added = 0
|
539 |
retry_count = 0
|
|
|
562 |
# Update progress
|
563 |
batch_size = len(batch)
|
564 |
total_added += batch_size
|
|
|
565 |
|
566 |
# Memory cleanup every few batches
|
567 |
if total_added % (initial_batch_size * 5) == 0:
|
|
|
607 |
cpu_index = self.index
|
608 |
|
609 |
# Add remaining vectors on CPU with very small batches
|
610 |
+
batch_size = 128
|
611 |
total_added = already_added
|
612 |
|
613 |
for i in range(0, len(remaining_embeddings), batch_size):
|
|
|
900 |
warmup_steps_ratio: float = 0.1,
|
901 |
early_stopping_patience: int = 3,
|
902 |
min_delta: float = 1e-4,
|
|
|
903 |
neg_samples: int = 1
|
904 |
) -> None:
|
905 |
+
"""Streaming training with tf.data pipeline."""
|
906 |
+
logger.info("Starting streaming training pipeline with tf.data...")
|
|
|
|
|
|
|
|
|
907 |
|
908 |
+
# Initialize TFDataPipeline (replaces StreamingDataPipeline)
|
909 |
+
dataset_preparer = TFDataPipeline(
|
910 |
+
embedding_batch_size=self.config.embedding_batch_size,
|
911 |
tokenizer=self.tokenizer,
|
912 |
encoder=self.encoder,
|
913 |
+
index=self.index, # Pass CPU version of FAISS index
|
914 |
response_pool=self.response_pool,
|
915 |
max_length=self.config.max_context_token_limit,
|
|
|
916 |
neg_samples=neg_samples
|
917 |
)
|
918 |
|
919 |
# Calculate total steps for learning rate schedule
|
920 |
total_pairs = dataset_preparer.estimate_total_pairs(dialogues)
|
921 |
+
train_size = int(total_pairs * (1 - validation_split))
|
922 |
+
val_size = int(total_pairs * validation_split)
|
923 |
steps_per_epoch = int(math.ceil(train_size / batch_size))
|
924 |
+
val_steps = int(math.ceil(val_size / batch_size))
|
925 |
total_steps = steps_per_epoch * epochs
|
926 |
|
927 |
logger.info(f"Total pairs: {total_pairs}")
|
928 |
logger.info(f"Training pairs: {train_size}")
|
929 |
+
logger.info(f"Validation pairs: {val_size}")
|
930 |
logger.info(f"Steps per epoch: {steps_per_epoch}")
|
931 |
logger.info(f"Validation steps: {val_steps}")
|
932 |
logger.info(f"Total steps: {total_steps}")
|
|
|
957 |
val_log_dir = str(log_dir / f"val_{current_time}")
|
958 |
train_summary_writer = tf.summary.create_file_writer(train_log_dir)
|
959 |
val_summary_writer = tf.summary.create_file_writer(val_log_dir)
|
|
|
960 |
logger.info(f"TensorBoard logs will be saved in {log_dir}")
|
961 |
|
962 |
+
# Create training and validation datasets
|
963 |
+
train_dataset = dataset_preparer.get_tf_dataset(dialogues, batch_size).take(train_size)
|
964 |
+
val_dataset = dataset_preparer.get_tf_dataset(dialogues, batch_size).skip(train_size).take(val_size)
|
965 |
+
|
966 |
# Training loop
|
967 |
best_val_loss = float("inf")
|
968 |
epochs_no_improve = 0
|
969 |
|
970 |
+
for epoch in range(1, epochs + 1):
|
971 |
+
# --- Training Phase ---
|
972 |
+
epoch_loss_avg = tf.keras.metrics.Mean()
|
973 |
+
batches_processed = 0
|
|
|
|
|
|
974 |
|
975 |
+
try:
|
976 |
+
train_pbar = tqdm(total=steps_per_epoch, desc=f"Training Epoch {epoch}", unit="batch")
|
977 |
+
is_tqdm_train = True
|
978 |
+
except ImportError:
|
979 |
+
train_pbar = None
|
980 |
+
is_tqdm_train = False
|
981 |
+
logger.info("Training progress bar disabled")
|
982 |
+
|
983 |
+
for q_batch, p_batch, n_batch in train_dataset:
|
984 |
+
#p_batch = p_n_batch[:, 0, :] # Extract positive from (positive, negative) pair
|
985 |
+
loss = self.train_step(q_batch, p_batch, n_batch)
|
986 |
+
epoch_loss_avg(loss)
|
987 |
+
batches_processed += 1
|
988 |
+
|
989 |
+
# Log to TensorBoard
|
990 |
+
with train_summary_writer.as_default():
|
991 |
+
tf.summary.scalar("loss", loss, step=(epoch - 1) * steps_per_epoch + batches_processed)
|
992 |
|
993 |
+
# Update progress bar
|
994 |
+
if use_lr_schedule:
|
995 |
+
current_lr = float(lr_schedule(self.optimizer.iterations))
|
996 |
+
else:
|
997 |
+
current_lr = float(self.optimizer.learning_rate.numpy())
|
998 |
|
999 |
+
if is_tqdm_train:
|
1000 |
+
train_pbar.update(1)
|
1001 |
+
train_pbar.set_postfix({
|
1002 |
+
"loss": f"{loss.numpy():.4f}",
|
1003 |
+
"lr": f"{current_lr:.2e}",
|
1004 |
+
"batches": f"{batches_processed}/{steps_per_epoch}"
|
1005 |
+
})
|
1006 |
+
|
1007 |
+
# Memory cleanup
|
1008 |
+
gc.collect()
|
|
|
|
|
|
1009 |
|
1010 |
+
if batches_processed >= steps_per_epoch:
|
1011 |
+
break
|
|
|
|
|
1012 |
|
1013 |
+
if is_tqdm_train and train_pbar:
|
1014 |
+
train_pbar.close()
|
|
|
|
|
|
|
|
|
1015 |
|
1016 |
+
# --- Validation Phase ---
|
1017 |
+
val_loss_avg = tf.keras.metrics.Mean()
|
1018 |
+
val_batches_processed = 0
|
1019 |
|
1020 |
try:
|
1021 |
+
val_pbar = tqdm(total=val_steps, desc="Validation", unit="batch")
|
1022 |
+
is_tqdm_val = True
|
1023 |
+
except ImportError:
|
1024 |
+
val_pbar = None
|
1025 |
+
is_tqdm_val = False
|
1026 |
+
logger.info("Validation progress bar disabled")
|
1027 |
+
|
1028 |
+
for q_batch, p_batch, n_batch in val_dataset:
|
1029 |
+
#p_batch = p_n_batch[:, 0, :] # Extract positive from (positive, negative) pair
|
1030 |
+
val_loss = self.validation_step(q_batch, p_batch, n_batch)
|
1031 |
+
val_loss_avg(val_loss)
|
1032 |
+
val_batches_processed += 1
|
1033 |
+
|
1034 |
+
if is_tqdm_val:
|
1035 |
+
val_pbar.update(1)
|
1036 |
+
val_pbar.set_postfix({
|
1037 |
+
"val_loss": f"{val_loss.numpy():.4f}",
|
1038 |
+
"batches": f"{val_batches_processed}/{val_steps}"
|
1039 |
+
})
|
1040 |
+
|
1041 |
+
# Memory cleanup
|
1042 |
+
gc.collect()
|
1043 |
|
|
|
|
|
|
1044 |
|
1045 |
+
if val_batches_processed >= val_steps:
|
1046 |
+
break
|
|
|
|
|
|
|
|
|
|
|
1047 |
|
1048 |
+
if is_tqdm_val and val_pbar:
|
1049 |
+
val_pbar.close()
|
|
|
1050 |
|
1051 |
+
# End of epoch: compute final epoch stats, log, and save checkpoint
|
1052 |
+
train_loss = epoch_loss_avg.result().numpy()
|
1053 |
+
val_loss = val_loss_avg.result().numpy()
|
1054 |
+
logger.info(f"Epoch {epoch} Complete: Train Loss={train_loss:.4f}, Val Loss={val_loss:.4f}")
|
1055 |
|
1056 |
+
# Log epoch metrics
|
1057 |
+
with train_summary_writer.as_default():
|
1058 |
+
tf.summary.scalar("epoch_loss", train_loss, step=epoch)
|
1059 |
+
with val_summary_writer.as_default():
|
1060 |
+
tf.summary.scalar("val_loss", val_loss, step=epoch)
|
1061 |
|
1062 |
+
# Save checkpoint
|
1063 |
+
manager.save()
|
|
|
|
|
|
1064 |
|
1065 |
+
# Store metrics in history
|
1066 |
+
self.history['train_loss'].append(train_loss)
|
1067 |
+
self.history['val_loss'].append(val_loss)
|
|
|
1068 |
|
1069 |
+
if use_lr_schedule:
|
1070 |
+
current_lr = float(lr_schedule(self.optimizer.iterations))
|
1071 |
+
else:
|
1072 |
+
current_lr = float(self.optimizer.learning_rate.numpy())
|
1073 |
|
1074 |
+
self.history.setdefault('learning_rate', []).append(current_lr)
|
|
|
|
|
|
|
1075 |
|
1076 |
+
# Early stopping logic
|
1077 |
+
if val_loss < best_val_loss - min_delta:
|
1078 |
+
best_val_loss = val_loss
|
1079 |
+
epochs_no_improve = 0
|
1080 |
+
logger.info(f"Validation loss improved to {val_loss:.4f}. Reset patience.")
|
1081 |
+
else:
|
1082 |
+
epochs_no_improve += 1
|
1083 |
+
logger.info(f"No improvement this epoch. Patience: {epochs_no_improve}/{early_stopping_patience}")
|
1084 |
+
if epochs_no_improve >= early_stopping_patience:
|
1085 |
+
logger.info("Early stopping triggered.")
|
1086 |
+
break
|
1087 |
|
1088 |
logger.info("Streaming training completed!")
|
1089 |
|
1090 |
|
1091 |
     @tf.function
+    def train_step(
+        self,
+        q_batch: tf.Tensor,
+        p_batch: tf.Tensor,
+        n_batch: tf.Tensor,
+        attention_mask: Optional[tf.Tensor] = None
+    ) -> tf.Tensor:
+        """
+        Single training step that uses queries, positives, and negatives in a
+        contrastive/InfoNCE style. The label is always 0 (the positive) vs.
+        the negative alternatives.
+        """
         with tf.GradientTape() as tape:
+            # Encode queries
+            q_enc = self.encoder(q_batch, training=True)  # [batch_size, embed_dim]
+
+            # Encode positives
+            p_enc = self.encoder(p_batch, training=True)  # [batch_size, embed_dim]
+
+            # Encode negatives
+            # n_batch: [batch_size, neg_samples, max_length]
+            shape = tf.shape(n_batch)
+            bs = shape[0]
+            neg_samples = shape[1]
+
+            # Flatten negatives to feed them in one pass:
+            # => [batch_size * neg_samples, max_length]
+            n_batch_flat = tf.reshape(n_batch, [bs * neg_samples, shape[2]])
+            n_enc_flat = self.encoder(n_batch_flat, training=True)  # [bs*neg_samples, embed_dim]
+
+            # Reshape back => [batch_size, neg_samples, embed_dim]
+            n_enc = tf.reshape(n_enc_flat, [bs, neg_samples, -1])
+
+            # Combine the positive embedding and negative embeddings along dim=1
+            # => shape [batch_size, 1 + neg_samples, embed_dim]
+            # The first column is the positive; subsequent columns are negatives
+            combined_p_n = tf.concat(
+                [tf.expand_dims(p_enc, axis=1), n_enc],
+                axis=1
+            )  # [bs, (1+neg_samples), embed_dim]
+
+            # Now compute scores: dot product of q_enc with each column in combined_p_n
+            # We'll use `tf.einsum` to handle the batch dimension properly
+            # dot_products => shape [batch_size, (1+neg_samples)]
+            dot_products = tf.einsum('bd,bkd->bk', q_enc, combined_p_n)
+
+            # The label for each row is 0 (the first column is the correct/positive)
+            labels = tf.zeros([bs], dtype=tf.int32)
+
+            # Cross-entropy over the [batch_size, 1+neg_samples] scores
             loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
+                labels=labels,
+                logits=dot_products
             )
+            loss = tf.reduce_mean(loss)
+
+            # If there's an attention_mask you want to apply (less common in this scenario),
+            # you could do something like:
             if attention_mask is not None:
                 loss = loss * attention_mask
                 loss = tf.reduce_sum(loss) / tf.reduce_sum(attention_mask)
 
+        # Apply gradients
         gradients = tape.gradient(loss, self.encoder.trainable_variables)
         self.optimizer.apply_gradients(zip(gradients, self.encoder.trainable_variables))
         return loss
 
     @tf.function
+    def validation_step(
+        self,
+        q_batch: tf.Tensor,
+        p_batch: tf.Tensor,
+        n_batch: tf.Tensor,
+        attention_mask: Optional[tf.Tensor] = None
+    ) -> tf.Tensor:
+        """
+        Single validation step with queries, positives, and negatives.
+        Uses the same loss calculation as train_step, but `training=False`.
+        """
         q_enc = self.encoder(q_batch, training=False)
         p_enc = self.encoder(p_batch, training=False)
 
+        shape = tf.shape(n_batch)
+        bs = shape[0]
+        neg_samples = shape[1]
+
+        n_batch_flat = tf.reshape(n_batch, [bs * neg_samples, shape[2]])
+        n_enc_flat = self.encoder(n_batch_flat, training=False)
+        n_enc = tf.reshape(n_enc_flat, [bs, neg_samples, -1])
+
+        combined_p_n = tf.concat(
+            [tf.expand_dims(p_enc, axis=1), n_enc],
+            axis=1
+        )
+
+        dot_products = tf.einsum('bd,bkd->bk', q_enc, combined_p_n)
+        labels = tf.zeros([bs], dtype=tf.int32)
 
         loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
+            labels=labels,
+            logits=dot_products
         )
+        loss = tf.reduce_mean(loss)
+
         if attention_mask is not None:
             loss = loss * attention_mask
            loss = tf.reduce_sum(loss) / tf.reduce_sum(attention_mask)
 
         return loss
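Aside, not part of the diff: for intuition about the scoring above, here is a tiny standalone check of the shape flow with made-up numbers; the names mirror the committed code but the values are purely illustrative.

import tensorflow as tf

# 2 queries, 3 hard negatives each, embedding dim 4 (toy values)
q_enc = tf.random.normal([2, 4])
p_enc = tf.random.normal([2, 4])
n_enc = tf.random.normal([2, 3, 4])

combined_p_n = tf.concat([tf.expand_dims(p_enc, axis=1), n_enc], axis=1)  # [2, 4, 4]: positive in column 0
dot_products = tf.einsum('bd,bkd->bk', q_enc, combined_p_n)               # [2, 4] similarity scores
labels = tf.zeros([2], dtype=tf.int32)                                    # column 0 is always the positive

loss = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=dot_products)
)
print(dot_products.shape, float(loss))  # (2, 4) and a scalar loss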
|
|
1337 |
conversation_parts.append(f"{self.special_tokens['user']} {query}")
|
1338 |
return "\n".join(conversation_parts)
|
1339 |
|
1340 |
+
class TFDataPipeline:
|
|
|
1341 |
def __init__(
|
1342 |
+
self,
|
1343 |
+
embedding_batch_size,
|
1344 |
+
tokenizer,
|
1345 |
+
encoder,
|
1346 |
+
index,
|
1347 |
+
response_pool,
|
1348 |
+
max_length: int,
|
1349 |
+
neg_samples: int,
|
1350 |
):
|
1351 |
+
self.embedding_batch_size = embedding_batch_size
|
1352 |
self.tokenizer = tokenizer
|
1353 |
self.encoder = encoder
|
1354 |
+
self.index = index # CPU version of the index
|
1355 |
self.response_pool = response_pool
|
1356 |
self.max_length = max_length
|
|
|
1357 |
self.neg_samples = neg_samples
|
1358 |
+
self.embedding_batch_size = 16 if len(response_pool) < 100 else 64
|
1359 |
+
self.search_batch_size = 8 if len(response_pool) < 100 else 32
|
1360 |
+
self.max_batch_size = 32 if len(response_pool) < 100 else 256
|
1361 |
self.memory_monitor = GPUMemoryMonitor()
|
|
|
|
|
|
|
1362 |
self.max_retries = 3
|
|
|
|
|
|
1363 |
|
1364 |
+
# In-memory cache for embeddings
|
1365 |
+
self.query_embeddings_cache = {}
|
|
|
|
|
1366 |
|
|
|
|
|
|
1367 |
def _extract_pairs_from_dialogue(self, dialogue: dict) -> List[Tuple[str, str]]:
|
1368 |
"""Extract query-response pairs from a dialogue."""
|
1369 |
pairs = []
|
|
|
1384 |
|
1385 |
return pairs
|
1386 |
|
1387 |
+
def estimate_total_pairs(self, dialogues: List[dict]) -> int:
|
1388 |
+
"""Estimate total number of training pairs including hard negatives."""
|
1389 |
+
base_pairs = sum(
|
1390 |
+
len([
|
1391 |
+
1 for i in range(len(d.get('turns', [])) - 1)
|
1392 |
+
if (d['turns'][i].get('speaker') == 'user' and
|
1393 |
+
d['turns'][i+1].get('speaker') == 'assistant')
|
1394 |
+
])
|
1395 |
+
for d in dialogues
|
1396 |
+
)
|
1397 |
+
# Account for hard negatives
|
1398 |
+
return base_pairs * (1 + self.neg_samples)
|
1399 |
+
|
|
|
|
|
1400 |
def _find_hard_negatives_batch(self, queries: List[str], positives: List[str]) -> List[List[str]]:
|
1401 |
"""Find hard negatives for a batch of queries with error handling and retries."""
|
1402 |
retry_count = 0
|
1403 |
total_responses = len(self.response_pool)
|
1404 |
+
|
|
|
|
1405 |
while retry_count < self.max_retries:
|
1406 |
try:
|
|
|
1407 |
query_embeddings = np.vstack([
|
1408 |
self.query_embeddings_cache[q] for q in queries
|
1409 |
]).astype(np.float32)
|
1410 |
|
1411 |
+
query_embeddings = np.ascontiguousarray(query_embeddings)
|
|
|
|
|
|
|
1412 |
faiss.normalize_L2(query_embeddings)
|
1413 |
+
|
1414 |
+
k = 1 # TODO: try higher k for better results
|
1415 |
#logger.debug(f"Searching with k={k} among {total_responses} responses")
|
|
|
|
|
1416 |
|
1417 |
+
distances, indices = self.index.search(query_embeddings, k)
|
1418 |
+
|
|
|
|
|
|
|
|
|
|
|
1419 |
all_negatives = []
|
1420 |
+
for query_indices, query, positive in zip(indices, queries, positives):
|
1421 |
negatives = []
|
1422 |
positive_strip = positive.strip()
|
|
|
|
|
1423 |
seen = {positive_strip}
|
1424 |
+
|
1425 |
for idx in query_indices:
|
1426 |
if idx >= 0 and idx < total_responses:
|
1427 |
candidate = self.response_pool[idx].strip()
|
1428 |
+
if candidate and candidate not in seen:
|
1429 |
seen.add(candidate)
|
1430 |
negatives.append(candidate)
|
1431 |
if len(negatives) >= self.neg_samples:
|
1432 |
break
|
1433 |
+
|
1434 |
+
# Pad with a special empty negative if necessary
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1435 |
while len(negatives) < self.neg_samples:
|
1436 |
+
negatives.append("<EMPTY_NEGATIVE>") # Use a special token
|
1437 |
+
|
1438 |
all_negatives.append(negatives)
|
1439 |
+
|
1440 |
return all_negatives
|
1441 |
+
|
1442 |
except Exception as e:
|
1443 |
retry_count += 1
|
1444 |
logger.warning(f"Hard negative search attempt {retry_count} failed: {e}")
|
1445 |
if retry_count == self.max_retries:
|
1446 |
logger.error("Max retries reached for hard negative search")
|
1447 |
+
return [["<EMPTY_NEGATIVE>"] * self.neg_samples for _ in queries] # Return empty negatives for all queries
|
1448 |
gc.collect()
|
1449 |
if tf.config.list_physical_devices('GPU'):
|
1450 |
tf.keras.backend.clear_session()
|
|
|
|
1451 |
|
    def _tokenize_negatives_tf(self, negatives):
        """Tokenizes negatives using tf.py_function."""
        # Handle the case where negatives is an empty tensor
        if tf.size(negatives) == 0:
            return tf.zeros([0, self.neg_samples, self.max_length], dtype=tf.int32)

        # Convert EagerTensor to a list of strings
        negatives_list = []
        for neg_list in negatives.numpy():
            decoded_negs = [neg.decode("utf-8") for neg in neg_list if neg]  # Filter out empty strings
            negatives_list.append(decoded_negs)

        # Flatten the list of lists
        flattened_negatives = [neg for sublist in negatives_list for neg in sublist]

        # Tokenize the flattened negatives
        if flattened_negatives:
            n_tokens = self.tokenizer(
                flattened_negatives,
                padding='max_length',
                truncation=True,
                max_length=self.max_length,
                return_tensors='tf'
            )
            # Reshape the tokens to [batch_size, neg_samples, max_length]
            n_tokens_reshaped = tf.reshape(n_tokens['input_ids'], [-1, self.neg_samples, self.max_length])
            return n_tokens_reshaped
        else:
            return tf.zeros([0, self.neg_samples, self.max_length], dtype=tf.int32)

    def _compute_embeddings(self, queries: List[str]) -> None:
        """Computes and caches embeddings for new queries."""
        new_queries = [q for q in queries if q not in self.query_embeddings_cache]
        if not new_queries:
            return  # All queries already cached

        new_embeddings = []
        for i in range(0, len(new_queries), self.embedding_batch_size):
            batch_queries = new_queries[i:i + self.embedding_batch_size]

            encoded = self.tokenizer(
                batch_queries,
                padding=True,
                truncation=True,
                max_length=self.max_length,
                return_tensors='tf'
            )

            # Compute embeddings on CPU
            with tf.device('/CPU:0'):
                batch_embeddings = self.encoder(encoded['input_ids'], training=False).numpy()

            new_embeddings.extend(batch_embeddings)

        # Update cache with new embeddings
        for query, emb in zip(new_queries, new_embeddings):
            self.query_embeddings_cache[query] = emb

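    # Illustrative sketch (not part of the module), assuming a chatbot instance `bot`:
    #
    #   bot._compute_embeddings(["hi there", "how are you?"])  # encodes both, fills the cache
    #   bot._compute_embeddings(["hi there"])                  # no-op: already cached
    #   emb = bot.query_embeddings_cache["hi there"]           # np.ndarray of shape [embedding_dim]
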
    def data_generator(self, dialogues: List[dict]) -> Generator[Tuple[str, str, List[str]], None, None]:
        """
        Generates training examples: (query, positive, hard_negatives).
        The outer loop is wrapped with tqdm for progress tracking.
        """
        total_dialogues = len(dialogues)
        logger.debug(f"Total dialogues to process: {total_dialogues}")

        # Initialize tqdm progress bar
        with tqdm(total=total_dialogues, desc="Processing Dialogues", unit="dialogue") as pbar:
            for dialogue in dialogues:
                pairs = self._extract_pairs_from_dialogue(dialogue)
                for query, positive in pairs:
                    # Ensure embeddings are computed, then find hard negatives
                    self._compute_embeddings([query])
                    hard_negatives = self._find_hard_negatives_batch([query], [positive])[0]
                    yield (query, positive, hard_negatives)
                pbar.update(1)

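    # Illustrative sketch (not part of the module): each yielded element is a triple
    #   ("user query text", "ground-truth response text", [neg_1, ..., neg_k])
    # where the negative list always has length neg_samples, padded with "<EMPTY_NEGATIVE>"
    # when the FAISS search returns fewer usable candidates.
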
    def _prepare_batch(self, queries: tf.Tensor, positives: tf.Tensor, negatives: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
        """Prepares a batch of data for training."""

        # Convert EagerTensors to lists of strings
        queries_list = [query.decode("utf-8") for query in queries.numpy()]
        positives_list = [pos.decode("utf-8") for pos in positives.numpy()]

        # Tokenize queries and positives
        q_tokens = self.tokenizer(queries_list, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='tf')
        p_tokens = self.tokenizer(positives_list, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='tf')

        # Decode negatives and ensure they are lists of strings
        negatives_list = []
        for neg_list in negatives.numpy():
            decoded_negs = [neg.decode("utf-8") for neg in neg_list if neg]  # Filter out empty strings
            negatives_list.append(decoded_negs)

        # Flatten negatives for tokenization if there are any valid negatives
        flattened_negatives = [neg for sublist in negatives_list for neg in sublist if neg]

        # Tokenize negatives if there are any
        n_tokens_reshaped = None
        if flattened_negatives:
            n_tokens = self.tokenizer(flattened_negatives, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='tf')

            # Reshape n_tokens to match the expected shape based on the number of negatives per query.
            # This part may need adjustment if the number of negatives varies per query.
            n_tokens_reshaped = tf.reshape(n_tokens['input_ids'], [len(queries_list), -1, self.max_length])
        else:
            # Create a placeholder tensor for the case where there are no negatives
            n_tokens_reshaped = tf.zeros([len(queries_list), 0, self.max_length], dtype=tf.int32)

        # Ensure n_tokens_reshaped has a consistent shape even when there are no negatives.
        # Adjust shape to [batch_size, num_neg_samples, max_length].
        if n_tokens_reshaped.shape[1] != self.neg_samples:
            # Pad or truncate the second dimension to match neg_samples
            padding = tf.zeros([len(queries_list), tf.maximum(0, self.neg_samples - n_tokens_reshaped.shape[1]), self.max_length], dtype=tf.int32)
            n_tokens_reshaped = tf.concat([n_tokens_reshaped, padding], axis=1)
            n_tokens_reshaped = n_tokens_reshaped[:, :self.neg_samples, :]

        # Concatenate the positive and negative examples along the 'neg_samples' dimension
        combined_p_n_tokens = tf.concat([tf.expand_dims(p_tokens['input_ids'], axis=1), n_tokens_reshaped], axis=1)

        return q_tokens['input_ids'], combined_p_n_tokens

    def get_tf_dataset(self, dialogues: List[dict], batch_size: int) -> tf.data.Dataset:
        """
        Creates a tf.data.Dataset for streaming training that yields
        (input_ids_query, input_ids_positive, input_ids_negatives).
        """
        # 1) Start with a generator dataset
        dataset = tf.data.Dataset.from_generator(
            lambda: self.data_generator(dialogues),
            output_signature=(
                tf.TensorSpec(shape=(), dtype=tf.string),       # Query (single string)
                tf.TensorSpec(shape=(), dtype=tf.string),       # Positive (single string)
                tf.TensorSpec(shape=(None,), dtype=tf.string)   # Hard negatives (list of strings)
            )
        )

        # 2) Batch the raw strings
        dataset = dataset.batch(batch_size)

        # 3) Map them through a tokenize step (via py_function)
        dataset = dataset.map(
            lambda q, p, n: self._tokenize_triple(q, p, n),
            num_parallel_calls=1  # tf.data.AUTOTUNE
        )

        dataset = dataset.prefetch(tf.data.AUTOTUNE)
        return dataset

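    # Illustrative sketch (not part of the module): consuming the streaming dataset.
    # The names `bot` and `train_dialogues` are assumptions for the example.
    #
    #   dataset = bot.get_tf_dataset(train_dialogues, batch_size=16)
    #   for q_ids, p_ids, n_ids in dataset:
    #       # q_ids: [16, max_length], p_ids: [16, max_length],
    #       # n_ids: [16, neg_samples, max_length] -- ready to feed the encoder
    #       ...
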
    def _tokenize_triple(
        self,
        q: tf.Tensor,
        p: tf.Tensor,
        n: tf.Tensor
    ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
        """
        Wraps a Python function via tf.py_function to convert tf.Tensors of strings
        -> Python lists of strings -> HF tokenizer -> Tensors of IDs.

        q is shape [batch_size], p is shape [batch_size],
        n is shape [batch_size, neg_samples] (i.e., each row is a list of negatives).
        """
        # Use tf.py_function with limited parallelism
        q_ids, p_ids, n_ids = tf.py_function(
            func=self._tokenize_triple_py,
            inp=[q, p, n, tf.constant(self.max_length), tf.constant(self.neg_samples)],
            Tout=[tf.int32, tf.int32, tf.int32]
        )

        # Manually set shape information
        q_ids.set_shape([None, self.max_length])                    # [batch_size, max_length]
        p_ids.set_shape([None, self.max_length])                    # [batch_size, max_length]
        n_ids.set_shape([None, self.neg_samples, self.max_length])  # [batch_size, neg_samples, max_length]

        return q_ids, p_ids, n_ids

    def _tokenize_triple_py(
        self,
        q: tf.Tensor,
        p: tf.Tensor,
        n: tf.Tensor,
        max_len: tf.Tensor,
        neg_samples: tf.Tensor
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """
        Python function that:
        - Decodes each tf.string Tensor to a Python list of strings
        - Calls the HF tokenizer
        - Reshapes negatives
        - Returns np.array of int32s for (q_ids, p_ids, n_ids).

        q: shape [batch_size], p: shape [batch_size]
        n: shape [batch_size, neg_samples]
        max_len: scalar int
        neg_samples: scalar int
        """
        max_len = int(max_len.numpy())  # Convert to Python int
        neg_samples = int(neg_samples.numpy())

        # 1) Convert Tensors -> Python lists of strings
        q_list = [q_i.decode("utf-8") for q_i in q.numpy()]  # shape [batch_size]
        p_list = [p_i.decode("utf-8") for p_i in p.numpy()]  # shape [batch_size]

        # shape [batch_size, neg_samples], decode each row
        n_list = []
        for row in n.numpy():
            # row is shape [neg_samples], each is a tf.string
            decoded = [neg.decode("utf-8") for neg in row]
            n_list.append(decoded)

        # 2) Tokenize queries & positives
        q_enc = self.tokenizer(
            q_list,
            padding="max_length",
            truncation=True,
            max_length=max_len,
            return_tensors="np"
        )
        p_enc = self.tokenizer(
            p_list,
            padding="max_length",
            truncation=True,
            max_length=max_len,
            return_tensors="np"
        )

        # 3) Tokenize negatives
        # Flatten [batch_size, neg_samples] -> single list
        flattened_negatives = [neg for row in n_list for neg in row]
        if len(flattened_negatives) == 0:
            # No negatives at all: return a zero array
            n_ids = np.zeros((len(q_list), neg_samples, max_len), dtype=np.int32)
        else:
            n_enc = self.tokenizer(
                flattened_negatives,
                padding="max_length",
                truncation=True,
                max_length=max_len,
                return_tensors="np"
            )
            # shape [batch_size * neg_samples, max_len]
            n_input_ids = n_enc["input_ids"]

            # Reshape to [batch_size, neg_samples, max_len],
            # handling cases where a row has fewer negatives
            batch_size = len(q_list)
            n_ids_list = []
            for i in range(batch_size):
                start_idx = i * neg_samples
                end_idx = start_idx + neg_samples
                row_negs = n_input_ids[start_idx:end_idx]

                # If fewer negatives, pad with zeros
                if row_negs.shape[0] < neg_samples:
                    deficit = neg_samples - row_negs.shape[0]
                    pad_arr = np.zeros((deficit, max_len), dtype=np.int32)
                    row_negs = np.concatenate([row_negs, pad_arr], axis=0)

                n_ids_list.append(row_negs)

            # Stack them -> shape [batch_size, neg_samples, max_len]
            n_ids = np.stack(n_ids_list, axis=0)

        # 4) Return as np.int32 arrays
        q_ids = q_enc["input_ids"].astype(np.int32)  # shape [batch_size, max_len]
        p_ids = p_enc["input_ids"].astype(np.int32)  # shape [batch_size, max_len]
        n_ids = n_ids.astype(np.int32)               # shape [batch_size, neg_samples, max_len]

        return q_ids, p_ids, n_ids

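    # Illustrative sanity check (not part of the module), assuming `bot` was built with
    # max_length=512 and neg_samples=3:
    #
    #   q = tf.constant(["hello"]); p = tf.constant(["hi there!"])
    #   n = tf.constant([["nope", "wrong", "<EMPTY_NEGATIVE>"]])
    #   q_ids, p_ids, n_ids = bot._tokenize_triple_py(
    #       q, p, n, tf.constant(512), tf.constant(3))
    #   assert q_ids.shape == (1, 512) and n_ids.shape == (1, 3, 512)
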
deduplicate_augmented_dialogues.py
ADDED
@@ -0,0 +1,74 @@
import json
from pathlib import Path
import logging
from typing import List, Dict
from collections import defaultdict

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def load_json_file(file_path: str) -> List[Dict]:
    """Load and parse a JSON file."""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except json.JSONDecodeError as e:
        logger.error(f"Error parsing JSON from {file_path}: {e}")
        return []
    except Exception as e:
        logger.error(f"Error reading file {file_path}: {e}")
        return []

def combine_json_files(input_directory: str, output_file: str):
    """
    Combine multiple JSON files while removing duplicates based on dialogue_id.

    Args:
        input_directory: Directory containing JSON files to process
        output_file: Path to save the combined output
    """
    # Track unique dialogues and their source files
    dialogue_map = {}
    duplicate_count = 0

    # Process all JSON files in the directory
    input_path = Path(input_directory)
    for json_file in input_path.glob('*.json'):
        logger.info(f"Processing {json_file}")

        data = load_json_file(str(json_file))

        # Process each dialogue in the file
        for dialogue in data:
            dialogue_id = dialogue.get('dialogue_id')

            if not dialogue_id:
                logger.warning(f"Found dialogue without ID in {json_file}")
                continue

            # Keep the first occurrence
            if dialogue_id in dialogue_map:
                duplicate_count += 1
                logger.debug(f"Duplicate dialogue_id found: {dialogue_id}")
            else:
                dialogue_map[dialogue_id] = dialogue

    # Convert the map of unique dialogues back to a list
    unique_dialogues = list(dialogue_map.values())

    # Save combined dialogues to a new file
    try:
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(unique_dialogues, f, indent=4)
        logger.info(f"Successfully combined files. Found {duplicate_count} duplicates.")
        logger.info(f"Total unique dialogues: {len(unique_dialogues)}")
    except Exception as e:
        logger.error(f"Error writing output file: {e}")

# Usage example
if __name__ == "__main__":
    combine_json_files(
        input_directory="/Users/joe/Desktop/Grad School/CSC525/CSC525_mod8_option2_joseph_armani/processed_outputs",
        output_file="augmented_dialogues.json"
    )

run_model_train.py
CHANGED
@@ -5,7 +5,6 @@ from response_quality_checker import ResponseQualityChecker
 from chatbot_validator import ChatbotValidator
 from training_plotter import TrainingPlotter
 
-
 # Configure logging
 from logger_config import config_logger
 logger = config_logger(__name__)
@@ -38,7 +37,7 @@ def main():
     env = EnvironmentSetup()
     env.initialize()
 
-    DEBUG_SAMPLES =
+    DEBUG_SAMPLES = 5
     EPOCHS = 5 if DEBUG_SAMPLES else 20
     TRAINING_DATA_PATH = 'processed_outputs/batch_group_0010.json'
 
@@ -47,13 +46,14 @@
 
     # Initialize configuration
     config = ChatbotConfig(
-        embedding_dim=
+        embedding_dim=768, # DistilBERT
         max_context_token_limit=512,
        freeze_embeddings=False,
     )
 
     # Load training data
     dialogues = RetrievalChatbot.load_training_data(data_path=TRAINING_DATA_PATH, debug_samples=DEBUG_SAMPLES)
+    print(dialogues)
 
     # Initialize chatbot and verify FAISS index
     #with env.strategy.scope():