Commit 4aec49f committed by JoeArmani
Parent(s): d7fc7a7
.gitignore CHANGED
@@ -182,10 +182,5 @@ training_data/*
  !training_data/.gitkeep
  augmented_dialogues.json
 
- checkpoints_old_REMOVE/*
- new_iteration/cache/*
- new_iteration/data_prep_iterative_models/*
- new_iteration/training_data/*
- new_iteration/processed_outputs/*
  raw_datasets/*
 
data_augmentation_code/augmentation_processing_pipeline.py DELETED
@@ -1,321 +0,0 @@
- from datetime import datetime
- from pathlib import Path
- from typing import List, Dict, Optional
- import json
- import re
- import hashlib
- import spacy
- import torch
- from tqdm import tqdm
- from data_augmentation.pipeline_config import PipelineConfig
- from data_augmentation.dialogue_augmenter import DialogueAugmenter
- from sklearn.feature_extraction.text import TfidfVectorizer
- from sklearn.metrics.pairwise import cosine_similarity
- from typing import Set
-
- class AugmentationProcessingPipeline:
-     """
-     Complete pipeline combining validation, optimization, and augmentation.
-     """
-
-     def __init__(self, config: Optional[PipelineConfig] = None):
-         self.config = config or PipelineConfig()
-         self.nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])
-         self.augmenter = DialogueAugmenter(self.nlp, self.config)
-         self.num_threads = self.config.batch_size
-         self.cache_dir = Path("./cache")
-         self.cache_dir.mkdir(exist_ok=True)
-         self.output_dir = Path("processed_outputs")
-         self.output_dir.mkdir(exist_ok=True)
-         self.checkpoint_file = self.output_dir / "processing_checkpoint.json"
-         self.batch_size = self.config.batch_size
-         self.use_gpu = torch.cuda.is_available()
-         self.batch_size = 32 if self.use_gpu else 8
-         self.use_multiprocessing = not self.use_gpu
-
-         # Counters for grouping batches
-         self.batch_counter = 0  # Count batches since last group combine
-         self.batch_group_number = 0  # How many groups have been created
-
-         if self.config.debug:
-             print(f"ProcessingPipeline initialized with:")
-             print(f"- GPU available: {self.use_gpu}")
-             print(f"- Batch size: {self.batch_size}")
-             print(f"- Using multiprocessing: {self.use_multiprocessing}")
-
-     def _save_batch(self, batch_results: List[Dict], batch_num: int) -> Path:
-         """Save a batch of results to a separate JSON file"""
-         batch_file = self.output_dir / f"batch_{batch_num:04d}.json"
-         with open(batch_file, 'w') as f:
-             json.dump(batch_results, f)
-         return batch_file
-
-     def _load_checkpoint(self) -> set:
-         """Load set of processed dialogue IDs from checkpoint"""
-         if self.checkpoint_file.exists():
-             with open(self.checkpoint_file, 'r') as f:
-                 return set(json.load(f))
-         return set()
-
-     def _update_checkpoint(self, processed_ids: set):
-         """Update checkpoint with newly processed IDs"""
-         with open(self.checkpoint_file, 'w') as f:
-             json.dump(list(processed_ids), f)
-
-     def _process_batch(self, batch: List[Dict]) -> List[Dict]:
-         """Process batch with optimized model calls"""
-         results = []
-         try:
-             if self.use_gpu:
-                 results = self.augmenter.process_batch(batch)
-             else:
-                 # Collect all texts that need processing
-                 all_texts = []
-                 text_to_dialogue_map = {}
-                 for dialogue in batch:
-                     for turn in dialogue['turns']:
-                         all_texts.append(turn['text'])
-                         text_to_dialogue_map[turn['text']] = dialogue['dialogue_id']
-
-                 # Batch process embeddings
-                 self.augmenter._compute_batch_embeddings(all_texts)
-
-                 # Process dialogues with cached embeddings
-                 for dialogue in batch:
-                     try:
-                         augmented = self.augmenter.augment_dialogue(dialogue)
-                         results.extend(augmented)
-                     except Exception as e:
-                         print(f"Error processing dialogue {dialogue.get('dialogue_id', 'unknown')}: {str(e)}")
-                         continue
-         except Exception as e:
-             print(f"Error processing batch: {str(e)}")
-         return results
-
-     def _combine_intermediate_batches(self):
-         """
-         Combine all current batch_*.json files into a single batch_group_XXXX.json file,
-         then remove the batch_*.json files.
-         """
-         batch_files = sorted(self.output_dir.glob("batch_*.json"))
-         if not batch_files:
-             return None  # No files to combine
-
-         combined_data = []
-         for bf in batch_files:
-             with open(bf, 'r') as f:
-                 combined_data.extend(json.load(f))
-             bf.unlink()  # Remove the individual batch file after reading
-
-         self.batch_group_number += 1
-         group_file = self.output_dir / f"batch_group_{self.batch_group_number:04d}.json"
-         with open(group_file, 'w') as f:
-             json.dump(combined_data, f)
-         return group_file
-
-     def combine_results(self) -> Path:
-         """Combine all batch_group_*.json files into final output"""
-         all_results = []
-         group_files = sorted(self.output_dir.glob("batch_group_*.json"))
-
-         print(f"Combining {len(group_files)} group files...")
-         for group_file in tqdm(group_files):
-             with open(group_file, 'r') as f:
-                 group_data = json.load(f)
-                 all_results.extend(group_data)
-
-         # Save combined results
-         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-         final_output = self.output_dir / f"augmented_dataset_{timestamp}.json"
-         with open(final_output, 'w') as f:
-             json.dump(all_results, f)
-
-         if self.config.debug:
-             print(f"Combined {len(all_results)} dialogues into {final_output}")
-
-         return final_output
-
-     def process_dataset(self, dialogues: List[Dict]) -> Path:
-         """Process dataset with hardware-appropriate optimizations and progress tracking"""
-         processed_ids = self._load_checkpoint()
-
-         # Filter out already processed dialogues
-         remaining_dialogues = [d for d in dialogues
-                                if d['dialogue_id'] not in processed_ids]
-
-         total_dialogues = len(dialogues)
-         remaining_count = len(remaining_dialogues)
-         processed_count = total_dialogues - remaining_count
-
-         print("\nDataset Processing Status:")
-         print(f"Total dialogues in dataset: {total_dialogues}")
-         print(f"Previously processed: {processed_count}")
-         print(f"Remaining to process: {remaining_count}")
-         print("-" * 50)
-
-         # Process in batches with progress bar
-         for batch_num in tqdm(range(0, len(remaining_dialogues), self.batch_size),
-                               desc="Processing batches",
-                               total=(len(remaining_dialogues) + self.batch_size - 1) // self.batch_size):
-             batch = remaining_dialogues[batch_num:batch_num + self.batch_size]
-             current_position = processed_count + batch_num + len(batch)
-
-             total_progress = (current_position / total_dialogues) * 100
-
-             print('\033[K', end='')
-             print(f"Processing: {current_position}/{total_dialogues} dialogues "
-                   f"({total_progress:.1f}% complete)")
-             print(f"Current batch: {batch_num//self.batch_size + 1} of "
-                   f"{(len(remaining_dialogues) + self.batch_size - 1) // self.batch_size}")
-             print("-" * 50)
-
-             # Process batch
-             batch_results = self._process_batch(batch)
-
-             if batch_results:
-                 self._save_batch(batch_results, batch_num)
-                 batch_ids = {d['dialogue_id'] for d in batch}
-                 processed_ids.update(batch_ids)
-                 self._update_checkpoint(processed_ids)
-
-                 # Increment batch counter and combine if needed
-                 self.batch_counter += 1
-                 if self.batch_counter == 25:
-                     # Combine these 25 batches into a group file
-                     self._combine_intermediate_batches()
-                     self.batch_counter = 0  # Reset counter after grouping
-
-         # If there are leftover batches less than 25
-         # combine them into one final group file
-         if self.batch_counter > 0:
-             self._combine_intermediate_batches()
-             self.batch_counter = 0
-
-         print("\n" + "-" * 50)
-         print("Processing complete. Combining results...")
-         return self.combine_results()
-
-     def cleanup(self):
-         """Clean up intermediate files after successful processing"""
-         # Clean up any leftover batch files (should not exist if logic is correct)
-         batch_files = list(self.output_dir.glob("batch_*.json"))
-         for file in batch_files:
-             try:
-                 file.unlink()
-             except Exception as e:
-                 print(f"Error deleting {file}: {e}")
-
-         # We can also remove batch_group_*.json if desired after final combine
-         # but that might not be necessary if we want to keep them.
-
-         if self.checkpoint_file.exists():
-             try:
-                 self.checkpoint_file.unlink()
-             except Exception as e:
-                 print(f"Error deleting checkpoint file: {e}")
-
-     def _deduplicate_dialogues(self, dialogues: List[Dict], threshold: float = 0.9) -> List[Dict]:
-         """
-         Deduplicate dialogues based on text similarity.
-         """
-         print("Deduplicating dialogues...")
-         if not dialogues:
-             print("No dialogues provided for deduplication.")
-             return []
-
-         # Combine turns into single text for similarity comparison
-         texts = [" ".join(turn['text'] for turn in dialogue['turns']) for dialogue in dialogues]
-         tfidf = TfidfVectorizer().fit_transform(texts)
-         sim_matrix = cosine_similarity(tfidf)
-
-         unique_indices = set()
-         for i, row in enumerate(sim_matrix):
-             if i not in unique_indices:
-                 similar_indices = [j for j, sim in enumerate(row) if sim > threshold and j != i]
-                 unique_indices.add(i)
-                 unique_indices.difference_update(similar_indices)
-
-         deduplicated_dialogues = [dialogues[i] for i in unique_indices]
-
-         print(f"Deduplication complete. Reduced from {len(dialogues)} to {len(deduplicated_dialogues)} dialogues.")
-         return deduplicated_dialogues
-
-     def _validate_and_clean_dialogue(self, dialogue: Dict) -> Optional[Dict]:
-         """
-         Validate and clean a single dialogue.
-         """
-         try:
-             # Check required fields
-             if not all(field in dialogue for field in self.config.required_fields):
-                 return None
-
-             # Process turns
-             cleaned_turns = []
-             for turn in dialogue['turns']:
-                 if self._validate_turn(turn):
-                     cleaned_turn = {
-                         'speaker': turn['speaker'],
-                         'text': self._clean_text(turn['text'])
-                     }
-                     cleaned_turns.append(cleaned_turn)
-
-             if cleaned_turns:
-                 return {
-                     'dialogue_id': dialogue['dialogue_id'],
-                     'turns': cleaned_turns
-                 }
-
-             return None
-
-         except Exception as e:
-             print(f"Error processing dialogue {dialogue.get('dialogue_id', 'unknown')}: {str(e)}")
-             return None
-
-     def _validate_turn(self, turn: Dict) -> bool:
-         """
-         Validate a single speaking turn.
-         """
-         return (
-             turn['speaker'] in self.config.allowed_speakers and
-             self.config.min_length <= len(turn['text']) <= self.config.max_length
-         )
-
-     def _clean_text(self, text: str) -> str:
-         """
-         Clean and normalize text.
-         """
-         # Remove excessive whitespace
-         text = re.sub(r'\s+', ' ', text.strip())
-
-         # Normalize quotes and apostrophes
-         text = re.sub(r'[’´`]', "'", text)
-         text = re.sub(r'[“”]', '"', text)
-
-         # Remove control characters
-         text = "".join(char for char in text if ord(char) >= 32 or char == '\n')
-
-         return text
-
-     def _process_validation(self, items: List, func, description: str) -> List:
-         """
-         Process items sequentially with a progress bar.
-         """
-         results = []
-         print(f"Starting {description}")
-         for item in tqdm(items, desc=description):
-             try:
-                 result = func(item)
-                 if result is not None:
-                     results.append(result)
-             except Exception as e:
-                 print(f"Error processing item: {str(e)}")
-         print(f"Completed {description}. Processed {len(results)} items successfully")
-         return results
-
-     def _get_cache_path(self, data: List[Dict]) -> Path:
-         """
-         Generate cache file path based on data hash.
-         """
-         data_str = json.dumps(data, sort_keys=True)
-         hash_value = hashlib.md5(data_str.encode()).hexdigest()
-         return self.cache_dir / f"cache_{hash_value}.pkl"
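
The deleted pipeline resumes across runs by persisting processed dialogue IDs to processing_checkpoint.json and skipping them on the next invocation. A minimal sketch of that checkpoint pattern, assuming only the file names used above; the dialogue data and the stubbed-out augmentation step are illustrative:

import json
from pathlib import Path

output_dir = Path("processed_outputs")
output_dir.mkdir(exist_ok=True)
checkpoint = output_dir / "processing_checkpoint.json"

# IDs recorded on a previous run are skipped this time around
processed = set(json.loads(checkpoint.read_text())) if checkpoint.exists() else set()
dialogues = [{"dialogue_id": f"d{i}", "turns": []} for i in range(5)]  # toy data

for d in dialogues:
    if d["dialogue_id"] in processed:
        continue  # already handled before an earlier interruption
    # ... augment d here ...
    processed.add(d["dialogue_id"])
    checkpoint.write_text(json.dumps(sorted(processed)))  # persist after each dialogue
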
data_augmentation_code/back_translator.py DELETED
@@ -1,87 +0,0 @@
- from transformers import (
-     MarianMTModel,
-     MarianTokenizer,
- )
-
- # Retained for reference but removed from the final code.
- # This method did not seem helpful for this retrieval-based chatbot.
- class BackTranslator:
-     """
-     Perform Back-translation with pivot language. English -> German -> Spanish -> English
-     Args:
-         source_lang: Source language (default: 'en')
-         pivot_lang: Pivot language (default: 'de')
-         target_lang: Target language (default: 'es')
-     Examples:
-         back_translator = BackTranslator()
-         back_translator.back_translate("Hello, how are you?")
-     """
-     def __init__(self, source_lang='en', pivot_lang='de', target_lang='es'):
-         # Forward (English to German)
-         pivot_forward_model_name = f'Helsinki-NLP/opus-mt-{source_lang}-{pivot_lang}'
-         self.tokenizer_pivot_forward = MarianTokenizer.from_pretrained(pivot_forward_model_name)
-         self.model_pivot_forward = MarianMTModel.from_pretrained(pivot_forward_model_name)
-
-         # Pivot translation (German to Spanish)
-         pivot_backward_model_name = f'Helsinki-NLP/opus-mt-{pivot_lang}-{target_lang}'
-         self.tokenizer_pivot_backward = MarianTokenizer.from_pretrained(pivot_backward_model_name)
-         self.model_pivot_backward = MarianMTModel.from_pretrained(pivot_backward_model_name)
-
-         # Backward (Spanish to English)
-         backward_model_name = f'Helsinki-NLP/opus-mt-{target_lang}-{source_lang}'
-         self.tokenizer_backward = MarianTokenizer.from_pretrained(backward_model_name)
-         self.model_backward = MarianMTModel.from_pretrained(backward_model_name)
-
-         # Set models to eval mode
-         self.model_pivot_forward.eval()
-         self.model_pivot_backward.eval()
-         self.model_backward.eval()
-
-     def back_translate(self, text, device=None):
-         try:
-             # Move models to device if specified
-             if device is not None:
-                 self.model_pivot_forward = self.model_pivot_forward.to(device)
-                 self.model_pivot_backward = self.model_pivot_backward.to(device)
-                 self.model_backward = self.model_backward.to(device)
-
-             # Forward translation (English to German)
-             encoded_pivot = self.tokenizer_pivot_forward([text], padding=True,
-                                                          truncation=True, return_tensors='pt')
-             if device is not None:
-                 encoded_pivot = {k: v.to(device) for k, v in encoded_pivot.items()}
-
-             generated_pivot = self.model_pivot_forward.generate(**encoded_pivot)
-             if device is not None:
-                 generated_pivot = generated_pivot.cpu()
-             pivot_text = self.tokenizer_pivot_forward.batch_decode(generated_pivot,
-                                                                    skip_special_tokens=True)[0]
-
-             # Pivot translation (German to Spanish)
-             encoded_back_pivot = self.tokenizer_pivot_backward([pivot_text], padding=True,
-                                                                truncation=True, return_tensors='pt')
-             if device is not None:
-                 encoded_back_pivot = {k: v.to(device) for k, v in encoded_back_pivot.items()}
-
-             retranslated_pivot = self.model_pivot_backward.generate(**encoded_back_pivot)
-             if device is not None:
-                 retranslated_pivot = retranslated_pivot.cpu()
-             tgt_text_back = self.tokenizer_pivot_backward.batch_decode(retranslated_pivot,
-                                                                        skip_special_tokens=True)[0]
-
-             # Backward translation (Spanish to English)
-             encoded_back = self.tokenizer_backward([tgt_text_back], padding=True,
-                                                    truncation=True, return_tensors='pt')
-             if device is not None:
-                 encoded_back = {k: v.to(device) for k, v in encoded_back.items()}
-
-             retranslated = self.model_backward.generate(**encoded_back)
-             if device is not None:
-                 retranslated = retranslated.cpu()
-             src_text = self.tokenizer_backward.batch_decode(retranslated,
-                                                             skip_special_tokens=True)[0]
-
-             return src_text
-         except Exception as e:
-             print(f"Error in back translation: {e}")
-             return text
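
For reference, the class was invoked as in its own docstring example. A minimal usage sketch, assuming the deleted module is still on the import path and the three Helsinki-NLP MarianMT models can be downloaded from the Hub:

from data_augmentation.back_translator import BackTranslator

bt = BackTranslator()  # en -> de -> es -> en round trip
print(bt.back_translate("Hello, how are you?"))  # falls back to the input on error
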
data_augmentation_code/dialogue_augmenter.py DELETED
@@ -1,710 +0,0 @@
- from typing import Dict, List
- import numpy as np
- import torch
- import tensorflow as tf
- import tensorflow_hub as hub
- from data_augmentation.pipeline_config import PipelineConfig
- from data_augmentation.quality_metrics import QualityMetrics
- from data_augmentation.paraphraser import Paraphraser
- import nlpaug.augmenter.word as naw
- from functools import lru_cache
- from sklearn.metrics.pairwise import cosine_similarity
-
- class DialogueAugmenter:
-     """
-     Optimized dialogue augmentation with quality control and complexity management.
-     """
-     def __init__(self, nlp, config: PipelineConfig):
-         self.nlp = nlp
-         self.config = config
-
-         # Detect hardware and set appropriate batch sizes and optimization strategy
-         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-         self.use_gpu = torch.cuda.is_available()
-
-         if self.config.debug:
-             print(f"Using device: {self.device}")
-             if self.use_gpu:
-                 print(f"GPU Device: {torch.cuda.get_device_name(0)}")
-
-         self.quality_metrics = QualityMetrics(config)
-         self.semantic_similarity_threshold = 0.75
-
-         # Load model
-         self.use_model = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
-
-         # Initialize augmentation models based on hardware
-         self._initialize_augmentation_models()
-
-         # Initialize caches
-         self.embedding_cache = {}
-
-         # GPU memory management if available
-         if self.use_gpu:
-             gpus = tf.config.list_physical_devices('GPU')
-             if gpus:
-                 try:
-                     for gpu in gpus:
-                         tf.config.experimental.set_memory_growth(gpu, True)
-                 except RuntimeError as e:
-                     print(e)
-
-     def _initialize_augmentation_models(self):
-         """Initialize augmentation models with appropriate device settings"""
-         # Advanced augmentation techniques
-         self.paraphraser = Paraphraser()
-         if self.use_gpu:
-             # Move model to GPU if available
-             self.paraphraser.model = self.paraphraser.model.to(self.device)
-
-         # Basic augmentation techniques
-         self.word_augmenter = naw.SynonymAug(aug_src='wordnet')
-
-         self.augmenters = {
-             'advanced': [
-                 self.paraphraser,
-             ],
-             'basic': [
-                 ('synonym', self.word_augmenter),
-             ]
-         }
-
-     @lru_cache(maxsize=1024)
-     def _compute_embedding(self, text: str) -> np.ndarray:
-         """Cached computation of text embedding"""
-         if text in self.embedding_cache:
-             return self.embedding_cache[text]
-         embedding = self.use_model([text])[0].numpy()
-         self.embedding_cache[text] = embedding
-         return embedding
-
-     def _compute_batch_embeddings(self, texts: List[str]) -> np.ndarray:
-         """Compute embeddings for multiple texts at once with hardware optimization"""
-         # Check cache first
-         uncached_texts = [t for t in texts if t not in self.embedding_cache]
-         if uncached_texts:
-             embeddings = self.use_model(uncached_texts).numpy()
-             # Update cache
-             for text, embedding in zip(uncached_texts, embeddings):
-                 self.embedding_cache[text] = embedding
-
-         # Return all embeddings (from cache or newly computed)
-         return np.array([self.embedding_cache[t] for t in texts])
-
-     def _quick_quality_check(self, variation: str, original: str) -> bool:
-         """
-         Preliminary quality check while maintaining reasonable pass rates
-         """
-         if self.config.debug:
-             print(f"\nQuick check for variation: {variation}")
-
-         orig_len = len(original.split())
-         var_len = len(variation.split())
-
-         # For very short texts (<= 3 words), still allow more variation
-         if orig_len <= 3:
-             if var_len > orig_len * 3:
-                 if self.config.debug:
-                     print(f"Failed length check (short text): {var_len} vs {orig_len}")
-                 return False
-         else:
-             if var_len > orig_len * 2:
-                 if self.config.debug:
-                     print(f"Failed length check (long text): {var_len} vs {orig_len}")
-                 return False
-
-         # Adjust content overlap check based on length
-         stop_words = {'a', 'an', 'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'is', 'are', 'that', 'this', 'will', 'can'}
-         orig_words = set(w.lower() for w in original.split() if w.lower() not in stop_words)
-         var_words = set(w.lower() for w in variation.split() if w.lower() not in stop_words)
-
-         # If very short turn (less than 5 words), skip the content overlap check
-         if orig_len >= 5:
-             content_overlap = len(orig_words.intersection(var_words)) / len(orig_words) if orig_words else 0
-             if content_overlap < 0.2:
-                 if self.config.debug:
-                     print(f"Failed content check: overlap {content_overlap:.2f}")
-                 return False
-         else:
-             if self.config.debug:
-                 print("Short turn detected (<5 words), skipping content overlap check")
-
-         if self.config.debug:
-             print("Passed all quick checks")
-         return True
-
-     def _filter_variations_batch(self, variations: List[str], context: List[str], original_turn: str) -> List[str]:
-         """
-         Filter variations using batched computations with detailed logging
-         """
-         if not variations:
-             return []
-
-         if self.config.debug:
-             print(f"\nStarting filtration of {len(variations)} variations")
-             print(f"Context length: {len(context)}")
-             print(f"Original turn: {original_turn}")
-
-         words = original_turn.split()
-         orig_len = len(words)
-
-         # If very short text, consider adjusting thresholds
-         is_very_short = orig_len < 5
-
-         if len(words) < 3:
-             if self.config.debug:
-                 print("Short text detected, using predefined variations")
-             short_text_variations = self._augment_short_text({'text': original_turn, 'speaker': ''})
-             return [var['text'] for var in short_text_variations]
-
-         # If this is the first turn (no context), be more lenient
-         if not context:
-             preliminary_filtered = variations
-             if self.config.debug:
-                 print("First turn - skipping preliminary filtering")
-         else:
-             # Quick preliminary filtering against original turn
-             preliminary_filtered = []
-             for var in variations:
-                 passed = self._quick_quality_check(var, original_turn)
-                 if self.config.debug:
-                     print(f"\nVariation: {var}")
-                     print(f"Passed quick check: {passed}")
-                 if passed:
-                     preliminary_filtered.append(var)
-
-         if self.config.debug:
-             print(f"Variations after quick check: {len(preliminary_filtered)}")
-
-         if not preliminary_filtered:
-             return []
-
-         # Compute embeddings for original and variations
-         original_embedding = self._compute_embedding(original_turn)
-         variation_embeddings = self._compute_batch_embeddings(preliminary_filtered)
-
-         # Compute similarities
-         sims = cosine_similarity([original_embedding], variation_embeddings)[0]
-
-         # If very short turn, slightly lower the semantic similarity threshold
-         dynamic_sem_threshold = self.semantic_similarity_threshold
-         if is_very_short:
-             dynamic_sem_threshold = max(0.7, self.semantic_similarity_threshold - 0.05)
-
-         # Filter by semantic similarity threshold
-         refined_filtered = []
-         for var, sim in zip(preliminary_filtered, sims):
-             if sim >= dynamic_sem_threshold:
-                 refined_filtered.append(var)
-             else:
-                 if self.config.debug:
-                     print(f"Variation '{var}' discarded due to low semantic similarity: {sim:.3f}")
-
-         if not refined_filtered:
-             return []
-
-         # Relax context coherence thresholds further if desired
-         # We already have min_similarity = 0.1, min_coherence = 0.05
-         # Let's lower them slightly more if the turn is very short:
-         if is_very_short:
-             min_similarity = 0.05
-             min_coherence = 0.02
-         else:
-             min_similarity = 0.1
-             min_coherence = 0.05
-
-         # Only use last turn for coherence
-         recent_context = [context[-1]] if context else []
-         context_text = ' '.join(recent_context) if recent_context else ''
-
-         if context_text:
-             if self.config.debug:
-                 print(f"\nContext text: {context_text}")
-
-             all_texts = [context_text] + refined_filtered
-             all_embeddings = self._compute_batch_embeddings(all_texts)
-
-             context_embedding = all_embeddings[0]
-             variation_embeddings = all_embeddings[1:]
-
-             # Vectorized similarity computation
-             context_similarities = cosine_similarity([context_embedding], variation_embeddings)[0]
-
-             # Response coherence check
-             if recent_context:
-                 prev_embedding = self._compute_embedding(recent_context[-1])
-                 response_coherence = cosine_similarity([prev_embedding], variation_embeddings)[0]
-             else:
-                 response_coherence = np.ones_like(context_similarities)
-
-             filtered_variations = []
-             for i, (variation, sim, coh) in enumerate(zip(
-                     refined_filtered, context_similarities, response_coherence)):
-                 combined_score = (
-                     self.config.context_similarity_weight * abs(sim) +
-                     self.config.response_coherence_weight * abs(coh)
-                 )
-
-                 if self.config.debug:
-                     print(f"\nVariation: {variation}")
-                     print(f"Context similarity: {sim:.3f}")
-                     print(f"Response coherence: {coh:.3f}")
-                     print(f"Combined score: {combined_score:.3f}")
-
-                 # Accept if EITHER score is good enough
-                 if (combined_score >= min_similarity or abs(coh) >= min_coherence):
-                     filtered_variations.append(variation)
-                     if self.config.debug:
-                         print("ACCEPTED")
-                 else:
-                     if self.config.debug:
-                         print("REJECTED")
-
-                 # If we have enough variations, stop
-                 if len(filtered_variations) >= self.config.max_variations_per_turn:
-                     break
-         else:
-             filtered_variations = refined_filtered[:self.config.max_variations_per_turn]
-
-         if self.config.debug:
-             print(f"\nFinal filtered variations: {len(filtered_variations)}")
-
-         return filtered_variations
-
-     def _generate_variations_progressive(self, text: str, needed: int) -> List[str]:
-         """
-         Generate variations progressively until we have enough good ones.
-         Adjust paraphraser parameters for closer paraphrases as needed.
-         """
-         variations = set()
-
-         if self.config.debug:
-             print(f"\nAttempting to generate {needed} variations for text: {text}")
-
-         # Fine-tune paraphraser here if needed: fewer beams, less diversity already done
-         for augmenter in self.augmenters['advanced']:
-             if len(variations) >= needed:
-                 break
-
-             try:
-                 if isinstance(augmenter, Paraphraser):
-                     if self.config.debug:
-                         print("Trying paraphrase augmentation...")
-                     new_vars = augmenter.paraphrase(
-                         text,
-                         num_return_sequences=needed-len(variations),
-                         device=self.device if self.use_gpu else None,
-                         num_beams=4,  # even fewer beams for more faithful paraphrases
-                         num_beam_groups=1,
-                         diversity_penalty=0.0
-                     )
-                     if self.config.debug:
-                         print(f"Paraphraser generated {len(new_vars)} variations")
-
-                     valid_vars = [v for v in new_vars if v.strip() and v != text]
-                     variations.update(valid_vars)
-
-                     if self.config.debug:
-                         print(f"Current unique variations: {len(variations)}")
-
-             except Exception as e:
-                 print(f"Error in advanced augmentation: {str(e)}")
-                 continue
-
-         # Try basic augmenters if needed
-         if len(variations) < needed:
-             if self.config.debug:
-                 print("Not enough variations, trying basic augmenters...")
-
-             for aug_type, augmenter in self.augmenters['basic']:
-                 if len(variations) >= needed:
-                     break
-
-                 try:
-                     if self.config.debug:
-                         print(f"Trying {aug_type} augmentation...")
-
-                     new_vars = augmenter.augment(text, n=2)
-                     if isinstance(new_vars, list):
-                         valid_vars = [v for v in new_vars if v.strip() and v != text]
-                         variations.update(valid_vars)
-                     else:
-                         if new_vars.strip() and new_vars != text:
-                             variations.add(new_vars)
-
-                     if self.config.debug:
-                         print(f"After {aug_type}, total variations: {len(variations)}")
-
-                 except Exception as e:
-                     print(f"Error in {aug_type} augmentation: {str(e)}")
-                     continue
-
-         variations_list = list(variations)
-
-         if self.config.debug:
-             print(f"Final number of variations generated: {len(variations_list)}")
-             if not variations_list:
-                 print("WARNING: No variations were generated!")
-
-         return variations_list
-
-     def augment_dialogue(self, dialogue: Dict) -> List[Dict]:
-         """
-         Create augmented versions of the dialogue with optimized processing
-         """
-         # Early dialogue length check
-         original_length = len(dialogue['turns'])
-         if original_length > self.config.max_turns_per_dialogue:
-             if self.config.debug:
-                 print(f"Truncating dialogue from {original_length} to {self.config.max_turns_per_dialogue} turns")
-             dialogue['turns'] = dialogue['turns'][:self.config.max_turns_per_dialogue]
-
-         turn_variations = []
-         context = []
-
-         # Process each turn with progressive generation
-         for turn in dialogue['turns']:
-             original_text = turn['text']  # Store original turn text
-             variations = self._generate_variations_progressive(
-                 original_text,
-                 self.config.max_variations_per_turn
-             )
-
-             # Batch filter variations with original text
-             filtered_variations = self._filter_variations_batch(
-                 variations,
-                 context,
-                 original_text  # Pass the original turn text
-             )
-
-             # Create turn variations with speaker info
-             turn_vars = [{'speaker': turn['speaker'], 'text': v} for v in filtered_variations]
-
-             if self.config.debug:
-                 print(f"Turn {len(turn_variations)}: Generated {len(turn_vars)} variations")
-
-             turn_variations.append(turn_vars)
-             context.append(original_text)
-
-         # Generate combinations with sampling
-         augmented_dialogues = self._generate_dialogue_combinations(
-             dialogue['dialogue_id'],
-             turn_variations,
-             dialogue
-         )
-
-         # Add original dialogue
-         result = [{
-             'dialogue_id': f"{dialogue['dialogue_id']}_original",
-             'turns': dialogue['turns']
-         }]
-
-         # Add unique augmentations
-         result.extend(augmented_dialogues[:self.config.augmentation_factor])
-
-         if self.config.debug:
-             print(f"Generated {len(result)-1} unique augmented dialogues")
-
-         return result
-
-     def _variation_score(self, original: str, variation: str) -> float:
-         """
-         Compute a single numeric score for a variation to guide selection.
-         You could use semantic similarity, content preservation, etc.
-         Higher is better.
-         """
-         metrics = self.quality_metrics.compute_metrics(original, variation)
-         # Example: Primarily semantic similarity, with a slight boost for content preservation
-         # Adjust as needed.
-         score = metrics['semantic_similarity'] * 0.7 + metrics['content_preservation'] * 0.3
-         return score
-
-     def _dialogue_quality_score(self, dialogue: Dict, original_dialogue: Dict) -> float:
-         """
-         Compute a quality score for the entire augmented dialogue.
-         For example, average semantic similarity of turns to the original turns.
-         This is done after the dialogue is formed.
-         """
-         original_texts = [t['text'] for t in original_dialogue['turns']]
-         aug_texts = [t['text'] for t in dialogue['turns']]
-
-         # Compute semantic similarity turn-by-turn and average it
-         scores = []
-         for orig, aug in zip(original_texts, aug_texts):
-             # Simple semantic similarity for scoring
-             emb_orig = self._compute_embedding(orig)
-             emb_aug = self._compute_embedding(aug)
-             sim = (emb_orig @ emb_aug) / (np.linalg.norm(emb_orig)*np.linalg.norm(emb_aug))
-             scores.append(sim)
-
-         # Could also incorporate diversity checks, content overlap, etc.
-         return float(np.mean(scores)) if scores else 0.0
-
-     def _generate_dialogue_combinations(self, dialogue_id: str, turn_variations: List[List[Dict]], original_dialogue: Dict) -> List[Dict]:
-         """
-         Generate dialogue combinations using a more controlled approach:
-         - Include the original turn as a fallback variation for each turn.
-         - Sort variations by a quality score.
-         - Ensure a balanced augmentation by requiring at least some turns to be augmented.
-         - Over-generate and then select top dialogues by quality.
-         """
-         # Over-generate factor: create more candidates than needed
-         over_generate_factor = self.config.augmentation_factor * 2
-
-         # Add the original turn as a fallback variation for each turn if not present
-         for i, turn_variants in enumerate(turn_variations):
-             original_turn_text = None
-             # Check if we previously stored original turn text with a marker or just use the original dialogue
-             # If you previously used "|ORIGINAL|" marker, handle it here. Otherwise, just get from original_dialogue.
-             original_turn_text = original_dialogue['turns'][i]['text']
-
-             # Add the original turn as a variation if not already included
-             if not any(v['text'] == original_turn_text for v in turn_variants):
-                 turn_variants.append({
-                     'speaker': original_dialogue['turns'][i]['speaker'],
-                     'text': original_turn_text
-                 })
-
-             # Sort variations by score
-             original_text = original_dialogue['turns'][i]['text']
-             turn_variants.sort(key=lambda v: self._variation_score(original_text, v['text']), reverse=True)
-
-         augmented_dialogues = []
-         used_combinations = set()
-
-         def generate_candidates(current_turns=None, turn_index=0):
-             if current_turns is None:
-                 current_turns = []
-
-             if len(augmented_dialogues) >= over_generate_factor:
-                 return
-
-             if turn_index == len(turn_variations):
-                 # Completed a candidate dialogue
-                 dialogue_fingerprint = " | ".join(turn['text'] for turn in current_turns)
-                 if dialogue_fingerprint not in used_combinations:
-                     used_combinations.add(dialogue_fingerprint)
-                     # Check if we have enough augmented turns
-                     aug_count = sum(1 for orig, curr in zip(original_dialogue['turns'], current_turns)
-                                     if orig['text'] != curr['text'])
-                     # Require at least half the turns to be augmented, for example
-                     if aug_count >= max(1, len(turn_variations)//2):
-                         augmented_dialogues.append({
-                             'dialogue_id': f"{dialogue_id}_aug_{len(augmented_dialogues)}",
-                             'turns': current_turns.copy()
-                         })
-                 return
-
-             turn_candidates = turn_variations[turn_index]
-
-             # If no variations are available for this turn, let's just return without error.
-             # Normally, this shouldn't happen since we always add the original turn above.
-             if not turn_candidates:
-                 # If you want to at least have the original turn, add it now:
-                 original_text = original_dialogue['turns'][turn_index]['text']
-                 turn_candidates.append({
-                     'speaker': original_dialogue['turns'][turn_index]['speaker'],
-                     'text': original_text
-                 })
-
-             # After the fallback, if still empty for some reason, just return.
-             if not turn_candidates:
-                 return
-
-             # Example strategy:
-             # 1. Always try the top variation (most semantically similar).
-             # 2. If available and allowed, pick a mid-ranked variation for diversity.
-             # 3. Include the original turn if not selected yet.
-
-             num_vars = min(self.config.max_sampled_variations, len(turn_candidates))
-
-             # Always include top variation
-             candidates_to_pick = [turn_candidates[0]]
-
-             # If we have more than 2 variations and can pick more, add a middle variation for diversity
-             if len(turn_candidates) > 2 and num_vars > 1:
-                 mid_index = len(turn_candidates)//2
-                 candidates_to_pick.append(turn_candidates[mid_index])
-
-             # If we still have room for another variation, try adding the original turn if not included
-             if num_vars > len(candidates_to_pick):
-                 original_turn_text = original_dialogue['turns'][turn_index]['text']
-                 orig_candidate = next((v for v in turn_candidates if v['text'] == original_turn_text), None)
-                 if orig_candidate and orig_candidate not in candidates_to_pick:
-                     candidates_to_pick.append(orig_candidate)
-
-             # Shuffle candidates to produce different dialogues
-             np.random.shuffle(candidates_to_pick)
-
-             for variation in candidates_to_pick:
-                 if len(augmented_dialogues) >= over_generate_factor:
-                     return
-                 current_turns.append(variation)
-                 generate_candidates(current_turns, turn_index + 1)
-                 current_turns.pop()
-
-         try:
-             generate_candidates()
-         except Exception as e:
-             print(f"Error in dialogue generation: {str(e)}")
-             return []
-
-         # Over-generated set of augmented dialogues is now available
-         # Let's score them and pick the top ones
-         scored_dialogues = []
-         for d in augmented_dialogues:
-             score = self._dialogue_quality_score(d, original_dialogue)
-             scored_dialogues.append((score, d))
-
-         scored_dialogues.sort(key=lambda x: x[0], reverse=True)
-         # Pick top `augmentation_factor` dialogues
-         final_dialogues = [d for _, d in scored_dialogues[:self.config.augmentation_factor]]
-
-         return final_dialogues
-
-     # def _generate_dialogue_combinations(self, dialogue_id: str, turn_variations: List[List[Dict]]) -> List[Dict]:
-     #     """
-     #     Generate dialogue combinations using sampling
-     #     """
-     #     augmented_dialogues = []
-     #     used_combinations = set()
-
-     #     def generate_dialogues(current_turns=None, turn_index=0):
-     #         if current_turns is None:
-     #             current_turns = []
-
-     #         if len(augmented_dialogues) >= self.config.augmentation_factor:
-     #             return
-
-     #         if turn_index == len(turn_variations):
-     #             dialogue_fingerprint = " | ".join(turn['text'] for turn in current_turns)
-     #             if dialogue_fingerprint not in used_combinations:
-     #                 used_combinations.add(dialogue_fingerprint)
-     #                 augmented_dialogues.append({
-     #                     'dialogue_id': f"{dialogue_id}_aug_{len(augmented_dialogues)}",
-     #                     'turns': current_turns.copy()
-     #                 })
-     #             return
-
-     #         variations = list(turn_variations[turn_index])
-     #         np.random.shuffle(variations)
-
-     #         for variation in variations[:self.config.max_sampled_variations]:
-     #             if len(augmented_dialogues) >= self.config.augmentation_factor:
-     #                 return
-     #             current_turns.append(variation)
-     #             generate_dialogues(current_turns, turn_index + 1)
-     #             current_turns.pop()
-
-     #     try:
-     #         generate_dialogues()
-     #     except Exception as e:
-     #         print(f"Error in dialogue generation: {str(e)}")
-     #         return []
-
-     #     return augmented_dialogues
-
-     def _is_dialogue_duplicate(self, dialogue1: Dict, dialogue2: Dict) -> bool:
-         """
-         Check if two dialogues are duplicates.
-         """
-         text1 = " ".join(turn['text'] for turn in dialogue1['turns'])
-         text2 = " ".join(turn['text'] for turn in dialogue2['turns'])
-         return text1 == text2
-
-     def _augment_short_text(self, turn: Dict) -> List[Dict]:
-         """
-         Special handling for very short texts with predefined variations.
-         If predefined variations are found, return them directly.
-         Otherwise, produce simple punctuation and capitalization variants.
-         Skip heavy quality checks for efficiency. These variations are safe and minimal.
-         """
-         text = turn['text']
-         common_variations = {
-             'goodbye': [
-                 'Bye!', 'Farewell!', 'See you!', 'Take care!',
-                 'Goodbye!', 'Bye for now!', 'Until next time!'
-             ],
-             'hello': [
-                 'Hi!', 'Hey!', 'Hello!', 'Greetings!',
-                 'Good day!', 'Hi there!', 'Hello there!'
-             ],
-             'yes': [
-                 'Yes!', 'Correct!', 'Indeed!', 'Absolutely!',
-                 'That\'s right!', 'Definitely!', 'Sure!'
-             ],
-             'no': [
-                 'No!', 'Nope!', 'Not at all!', 'Negative!',
-                 'Unfortunately not!', 'I\'m afraid not!'
-             ],
-             'thanks': [
-                 'Thank you!', 'Thanks a lot!', 'Many thanks!',
-                 'I appreciate it!', 'Thank you so much!'
-             ],
-             'ok': [
-                 'Okay!', 'Alright!', 'Sure!', 'Got it!',
-                 'Understood!', 'Fine!', 'Great!', 'Perfect!',
-                 'That works!', 'Sounds good!'
-             ],
-             'good': [
-                 'Great!', 'Excellent!', 'Perfect!', 'Wonderful!',
-                 'Fantastic!', 'Amazing!', 'Terrific!'
-             ]
-         }
-
-         text_lower = text.lower().rstrip('!.,?')
-         # Check if text matches any predefined category
-         variations = []
-         for key, predefined_vars in common_variations.items():
-             if key in text_lower or text_lower in key:
-                 variations.extend(predefined_vars)
-
-         if not variations:
-             # Generate simple punctuation and capitalization variations if no predefined match
-             base = text.rstrip('!.,?')
-             variations = [
-                 base + '!',
-                 base + '.',
-                 base
-             ]
-
-         # Add capitalization variations
-         capitalized = [v.capitalize() for v in variations if v.capitalize() not in variations]
-         variations.extend(capitalized)
-
-         # Ensure uniqueness
-         unique_variations = list(set(variations))
-
-         # Directly return these variations, as they are minimal and trusted
-         # No further quality checks are needed
-         result_variations = unique_variations[:self.config.augmentation_factor]
-         return [{'speaker': turn['speaker'], 'text': v} for v in result_variations]
-
-     def process_batch(self, batch: List[Dict]) -> List[Dict]:
-         """Process multiple dialogues at once to maximize GPU utilization"""
-         results = []
-
-         # Pre-compute embeddings for all texts in batch
-         all_texts = []
-         text_to_embedding = {}
-
-         for dialogue in batch:
-             for turn in dialogue['turns']:
-                 all_texts.append(turn['text'])
-
-         # Batch compute embeddings
-         if all_texts:
-             embeddings = self._compute_batch_embeddings(all_texts)
-             for text, embedding in zip(all_texts, embeddings):
-                 self.embedding_cache[text] = embedding
-
-         # Process each dialogue using cached embeddings
-         for dialogue in batch:
-             try:
-                 augmented = self.augment_dialogue(dialogue)
-                 results.extend(augmented)
-             except Exception as e:
-                 print(f"Error processing dialogue {dialogue.get('dialogue_id', 'unknown')}: {e}")
-                 continue
-
-         return results
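
The turn filter in _filter_variations_batch accepts a variation when a weighted blend of context similarity and response coherence clears min_similarity (or coherence alone clears min_coherence). A toy calculation with made-up cosine values; the 0.35/0.65 weights are the PipelineConfig defaults:

context_similarity_weight = 0.35
response_coherence_weight = 0.65
sim, coh = 0.42, 0.18  # hypothetical cosine similarities for one variation

combined = context_similarity_weight * abs(sim) + response_coherence_weight * abs(coh)
print(f"combined score = {combined:.3f}")  # 0.264 -> clears min_similarity = 0.1, accepted
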
data_augmentation_code/main.py DELETED
@@ -1,112 +0,0 @@
- """
- CSC525 - Module 8 Option 2 - Joseph Armani
- Description and References in the README.md file.
- """
- import json
- import tensorflow as tf
- from typing import List, Dict
- from data_augmentation.pipeline_config import PipelineConfig
- from data_augmentation.augmentation_processing_pipeline import AugmentationProcessingPipeline
- from data_augmentation.taskmaster_processor import TaskmasterProcessor
- from data_augmentation.schema_guided_dialogue_processor import SchemaGuidedProcessor
-
- def combine_datasets(taskmaster_dialogues: List[Dict],
-                      schema_guided_dialogues: List[Dict]) -> List[Dict]:
-     """
-     Combine dialogues from both datasets into a single list
-
-     Args:
-         taskmaster_dialogues: List of dialogues in pipeline format from Taskmaster
-         schema_guided_dialogues: List of dialogues in pipeline format from Schema-Guided
-
-     Returns:
-         List[Dict]: Combined list of dialogues
-     """
-     # Ensure unique dialogue IDs
-     combined_dialogues = []
-     seen_ids = set()
-     duplicate_count = 0  # Track duplicates for reporting
-
-     for dialogue in taskmaster_dialogues:
-         dialogue_copy = dialogue.copy()
-         dialogue_id = dialogue_copy['dialogue_id']
-         if dialogue_id in seen_ids:
-             duplicate_count += 1
-             dialogue_id = f"taskmaster_{dialogue_id}"
-         seen_ids.add(dialogue_id)
-         dialogue_copy['dialogue_id'] = dialogue_id
-         combined_dialogues.append(dialogue_copy)
-
-     for dialogue in schema_guided_dialogues:
-         dialogue_copy = dialogue.copy()
-         dialogue_id = dialogue_copy['dialogue_id']
-         if dialogue_id in seen_ids:
-             duplicate_count += 1
-             dialogue_id = f"schema_guided_{dialogue_id}"
-         seen_ids.add(dialogue_id)
-         dialogue_copy['dialogue_id'] = dialogue_id
-         combined_dialogues.append(dialogue_copy)
-
-     # Log the results
-     print(f"Combine Datasets: Found and resolved {duplicate_count} duplicate dialogue IDs.")
-     print(f"Combine Datasets: Total dialogues combined: {len(combined_dialogues)}")
-
-     return combined_dialogues
-
- def main():
-     # Configuration
-     config = PipelineConfig(
-         min_length=1,
-         max_length=512,
-         batch_size=32 if tf.config.list_physical_devices('GPU') else 16,
-         max_turns_per_dialogue=12,
-         max_variations_per_turn=4,
-         max_sampled_variations=2,
-         context_window_size=4,
-         max_complexity_threshold=100,
-         use_cache=False,
-         debug=True,
-         allowed_speakers=['user', 'assistant'],
-         required_fields=['dialogue_id', 'turns']
-     )
-
-     try:
-         # Set max_examples (Optional[int]) for testing
-         max_examples = 5
-
-         # Initialize and load Taskmaster dataset
-         print("Loading Taskmaster dataset")
-         taskmaster_processor = TaskmasterProcessor(config, use_ontology=False)
-         taskmaster_dialogues = taskmaster_processor.load_dataset('./datasets/taskmaster', max_examples=max_examples)
-         taskmaster_pipeline_dialogues = taskmaster_processor.convert_to_pipeline_format(taskmaster_dialogues)
-         print(f"Processed Taskmaster dialogues: {len(taskmaster_pipeline_dialogues)}")
-
-         # Initialize and load Schema-Guided dataset
-         print("Loading Schema-Guided dataset")
-         schema_dialogue_processor = SchemaGuidedProcessor(config)
-         schema_dialogues = schema_dialogue_processor.load_dataset('./datasets/schema_guided', max_examples=max_examples)
-         schema_pipeline_dialogues = schema_dialogue_processor.convert_to_pipeline_format(schema_dialogues)
-         print(f"Processed Schema-Guided dialogues: {len(schema_pipeline_dialogues)}")
-
-         # Combine datasets
-         print("Combining datasets")
-         combined_dialogues = combine_datasets(taskmaster_pipeline_dialogues, schema_pipeline_dialogues)
-         print(f"Combined Dialogues: {len(combined_dialogues)}")
-
-         if not combined_dialogues:
-             print("Combined dialogues are empty. Exiting.")
-             return
-
-         # Process through augmentation pipeline
-         print("Processing combined dataset")
-         pipeline = AugmentationProcessingPipeline(config)
-         output_path = pipeline.process_dataset(combined_dialogues)
-         print(f"Processing complete. Results saved to {output_path}")
-         pipeline.cleanup()
-
-     except Exception as e:
-         print(f"Processing failed: {str(e)}")
-         raise
-
- if __name__ == "__main__":
-     main()
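
combine_datasets resolves dialogue-ID collisions by prefixing rather than dropping entries. A tiny self-contained illustration of that rule, with invented IDs:

seen = {"42"}           # hypothetical ID already taken
dialogue_id = "42"
if dialogue_id in seen:
    dialogue_id = f"taskmaster_{dialogue_id}"  # same rename rule as above
seen.add(dialogue_id)
print(dialogue_id)      # taskmaster_42 -- both dialogues are kept
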
data_augmentation_code/paraphraser.py DELETED
@@ -1,42 +0,0 @@
- from transformers import (
-     AutoTokenizer,
-     AutoModelForSeq2SeqLM,
- )
-
- class Paraphraser:
-     def __init__(self, model_name='humarin/chatgpt_paraphraser_on_T5_base'):
-         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
-         self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
-         self.model.eval()
-
-     def paraphrase(self, text, num_return_sequences=5, num_beams=5,
-                    num_beam_groups=1, diversity_penalty=0.0, device=None):
-         try:
-             input_text = "paraphrase: " + text + " </s>"
-             encoding = self.tokenizer.encode_plus(input_text, return_tensors="pt")
-
-             # Move input tensors to specified device if provided
-             if device is not None:
-                 input_ids = encoding["input_ids"].to(device)
-                 self.model = self.model.to(device)
-             else:
-                 input_ids = encoding["input_ids"]
-
-             outputs = self.model.generate(
-                 input_ids=input_ids,
-                 max_length=256,
-                 num_beams=num_beams,
-                 num_beam_groups=num_beam_groups,
-                 num_return_sequences=num_return_sequences,
-                 diversity_penalty=diversity_penalty,
-                 early_stopping=True
-             )
-
-             # Move outputs back to CPU for tokenizer decoding
-             outputs = outputs.cpu() if device is not None else outputs
-             paraphrases = [self.tokenizer.decode(output, skip_special_tokens=True)
-                            for output in outputs]
-             return paraphrases
-         except Exception as e:
-             print(f"Error in paraphrasing: {e}")
-             return []
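
A minimal usage sketch, assuming the deleted module is still on the import path; the humarin/chatgpt_paraphraser_on_T5_base weights download from the Hub on first use:

from data_augmentation.paraphraser import Paraphraser

p = Paraphraser()
for variant in p.paraphrase("What time does the store open?", num_return_sequences=3, num_beams=4):
    print(variant)  # returns [] on error rather than raising
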
data_augmentation_code/pipeline_config.py DELETED
@@ -1,57 +0,0 @@
- from dataclasses import dataclass
- from typing import List
-
- @dataclass
- class PipelineConfig:
-     """
-     Config for the pipeline
-     """
-     # Validation settings
-     min_length: int = 1
-     max_length: int = 512
-     min_tokens: int = 1
-     max_tokens: int = 128
-
-     allowed_speakers: List[str] = None
-     required_fields: List[str] = None
-
-     # Text augmentation settings
-     augmentation_factor: int = 4
-     augmentation_techniques: List[str] = None
-
-     max_turns_per_dialogue: int = 6
-     max_variations_per_turn: int = 3
-     max_sampled_variations: int = 2
-     max_complexity_threshold: int = 100
-     complexity_reduction_turns: int = 4
-
-     # Quality thresholds
-     semantic_similarity_threshold: float = 0.45
-     grammar_error_threshold: int = 2
-     rouge1_f1_threshold: float = 0.30
-     rouge2_f1_threshold: float = 0.15
-
-     # Response coherence thresholds
-     min_response_coherence: float = 0.3
-     context_similarity_weight: float = 0.35
-     response_coherence_weight: float = 0.65
-
-     # Performance settings
-     batch_size: int = 32
-     use_cache: bool = True
-     debug: bool = False
-
-     context_window_size: int = 4
-
-     def __post_init__(self):
-         if self.allowed_speakers is None:
-             self.allowed_speakers = ['user', 'assistant']
-         if self.required_fields is None:
-             self.required_fields = ['dialogue_id', 'turns']
-         if self.augmentation_techniques is None:
-             self.augmentation_techniques = ['paraphrase', 'back_translation']
-
-         # Validate weights sum to 1.0
-         if abs((self.context_similarity_weight + self.response_coherence_weight) - 1.0) > 1e-6:
-             raise ValueError("Context similarity and response coherence weights must sum to 1.0")
-
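
The __post_init__ guard rejects weight pairs that do not sum to 1.0. A short sketch of that check firing, assuming the deleted module is still importable:

from data_augmentation.pipeline_config import PipelineConfig

PipelineConfig(context_similarity_weight=0.35, response_coherence_weight=0.65)  # ok
try:
    PipelineConfig(context_similarity_weight=0.5, response_coherence_weight=0.6)
except ValueError as e:
    print(e)  # Context similarity and response coherence weights must sum to 1.0
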
data_augmentation_code/quality_metrics.py DELETED
@@ -1,47 +0,0 @@
- import tensorflow_hub as hub
- import spacy
- from sklearn.metrics.pairwise import cosine_similarity
- from typing import Dict
- from data_augmentation.pipeline_config import PipelineConfig
-
- class QualityMetrics:
-     """
-     Quality metrics focusing on semantic similarity and basic lexical stats.
-     """
-     def __init__(self, config: PipelineConfig):
-         self.config = config
-         self.use_model = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
-         self.nlp = spacy.load('en_core_web_md')
-
-     def compute_semantic_similarity(self, text1: str, text2: str) -> float:
-         embeddings = self.use_model([text1, text2])
-         emb1, emb2 = embeddings[0].numpy(), embeddings[1].numpy()
-         return cosine_similarity([emb1], [emb2])[0][0]
-
-     def compute_metrics(self, original: str, augmented: str) -> Dict[str, float]:
-         metrics = {}
-         # Semantic similarity
-         embeddings = self.use_model([original, augmented])
-         emb_orig, emb_aug = embeddings[0].numpy(), embeddings[1].numpy()
-         metrics['semantic_similarity'] = cosine_similarity([emb_orig], [emb_aug])[0][0]
-
-         # Lexical diversity & content preservation
-         doc_orig = self.nlp(original)
-         doc_aug = self.nlp(augmented)
-
-         aug_tokens = [token.text.lower() for token in doc_aug]
-         metrics['type_token_ratio'] = len(set(aug_tokens)) / max(len(aug_tokens), 1)
-
-         orig_content = {token.text.lower() for token in doc_orig if not token.is_stop}
-         aug_content = {token.text.lower() for token in doc_aug if not token.is_stop}
-         if len(orig_content) == 0:
-             metrics['content_preservation'] = 1.0 if len(aug_content) == 0 else 0.0
-         else:
-             metrics['content_preservation'] = len(orig_content.intersection(aug_content)) / len(orig_content)
-
-         # Length ratio
-         orig_words = len(original.split())
-         aug_words = len(augmented.split())
-         metrics['length_ratio'] = aug_words / max(orig_words, 1)
-
-         return metrics
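
A dependency-free illustration of two of these metrics, using a naive whitespace split and a tiny stop-word set in place of spaCy tokens (the texts and values are illustrative, so only approximate):

original = "book a table for two tonight"
augmented = "reserve a table for two people tonight"

# content preservation: fraction of original content words kept
orig_content = set(original.split()) - {"a", "for"}
aug_content = set(augmented.split()) - {"a", "for"}
content_preservation = len(orig_content & aug_content) / len(orig_content)

# type-token ratio: unique tokens over total tokens in the augmented text
aug_tokens = augmented.split()
type_token_ratio = len(set(aug_tokens)) / max(len(aug_tokens), 1)

print(f"content_preservation = {content_preservation:.2f}")  # 0.75
print(f"type_token_ratio = {type_token_ratio:.2f}")          # 1.00
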
data_augmentation_code/schema_guided_dialogue_processor.py DELETED
@@ -1,192 +0,0 @@
- from dataclasses import dataclass, field
- from typing import List, Dict, Optional, Any
- import json
- import glob
- from pathlib import Path
- from data_augmentation.pipeline_config import PipelineConfig
-
- @dataclass
- class SchemaGuidedDialogue:
-     """
-     Structured representation of a Schema-Guided dialogue
-     """
-     dialogue_id: str
-     service_name: str
-     service_description: Optional[str]
-     schema: Dict[str, Any]
-     turns: List[Dict[str, Any]]
-     original_metadata: Dict[str, Any] = field(default_factory=dict)
-
- class SchemaGuidedProcessor:
-     """
-     Handles processing and preparation of Schema-Guided dataset dialogues
-     """
-     def __init__(self, config: PipelineConfig):
-         self.config = config
-         self.services = set()
-         self.domains = set()
-         self.schemas = {}
-
-     def load_dataset(self, base_dir, max_examples: Optional[int] = None) -> List[SchemaGuidedDialogue]:
-         """
-         Load and parse the Schema-Guided Dialogue dataset.
-
-         Args:
-             base_dir: Directory containing schema.json and dialogues_*.json files
-             max_examples: Optional cap on the number of dialogues to load
-         """
-         # Define schema and dialogue file patterns
-         schema_file = Path(base_dir, "schema.json")
-         dialogue_files_pattern = str(Path(base_dir, "dialogues_*.json"))
-
-         # Check for schema file
-         if not schema_file.exists():
-             raise FileNotFoundError(f"Schema file not found at {schema_file}")
-
-         # Load schema
-         self.schemas = self._load_schemas(schema_file)
-
-         # Find and validate dialogue files
-         dialogue_files = glob.glob(dialogue_files_pattern)
-         if not dialogue_files:
-             raise FileNotFoundError(f"No dialogue files found matching pattern {dialogue_files_pattern}")
-
-         print(f"Found {len(dialogue_files)} dialogue files to process.")
-
-         # Process all dialogues
-         processed_dialogues = []
-         for file_path in dialogue_files:
-             with open(file_path, 'r', encoding='utf-8') as f:
-                 raw_dialogues = json.load(f)
-
-             for dialogue in raw_dialogues:
-                 processed_dialogues.append(self._process_single_dialogue(dialogue))
-
-             if max_examples and len(processed_dialogues) >= max_examples:
-                 break
-
-         return processed_dialogues
-
-     def _process_single_dialogue(self, dialogue: Dict[str, Any]) -> SchemaGuidedDialogue:
-         """
-         Process a single dialogue JSON object into a SchemaGuidedDialogue object.
-         """
-         dialogue_id = str(dialogue.get("dialogue_id", ""))
-         services = dialogue.get("services", [])
-         service_name = services[0] if services else None
-         schema = self.schemas.get(service_name, {})
-         service_description = schema.get("description", "")
-
-         # Process turns
-         turns = self._process_turns(dialogue.get("turns", []))
-
-         # Store metadata
-         metadata = {
-             "services": services,
-             "original_id": dialogue_id,
-         }
-
-         return SchemaGuidedDialogue(
-             dialogue_id=f"schema_guided_{dialogue_id}",
-             service_name=service_name,
-             service_description=service_description,
-             schema=schema,
-             turns=turns,
-             original_metadata=metadata,
-         )
-
-     def _validate_schema(self, schema: Dict[str, Any]) -> bool:
-         """
-         Validate a schema
-         """
-         required_keys = {"service_name", "description", "slots", "intents"}
-         missing_keys = required_keys - schema.keys()
-         if missing_keys:
-             print(f"Warning: Missing keys in schema {schema.get('service_name', 'unknown')}: {missing_keys}")
-             return False
-         return True
-
-     def _load_schemas(self, schema_path: str) -> Dict[str, Any]:
-         """
-         Load and process service schemas
-         """
-         with open(schema_path, 'r', encoding='utf-8') as f:
-             schemas = json.load(f)
-
-         # Validate and index schemas
-         return {
-             schema["service_name"]: schema for schema in schemas if self._validate_schema(schema)
-         }
-
-     def _process_turns(self, turns: List[Dict]) -> List[Dict]:
-         """
-         Process dialogue turns into standardized format
-         """
-         processed_turns = []
-
-         for turn in turns:
-             try:
-                 # Map speakers to standard format
-                 speaker = 'assistant' if turn.get('speaker') == 'SYSTEM' else 'user'
-
-                 # Extract utterance and clean it
-                 text = turn.get('utterance', '').strip()
-
-                 # Extract frames and dialogue acts
-                 frames = turn.get('frames', [])
-                 acts = []
-                 slots = []
-
-                 for frame in frames:
-                     if 'actions' in frame:
-                         acts.extend(frame['actions'])
-                     if 'slots' in frame:
-                         slots.extend(frame['slots'])
-
-                 # Create the processed turn
-                 processed_turn = {
-                     'speaker': speaker,
-                     'text': text,
-                     'original_speaker': turn.get('speaker', ''),
-                     'dialogue_acts': acts,
-                     'slots': slots,
-                     'metadata': {k: v for k, v in turn.items()
-                                  if k not in {'speaker', 'utterance', 'frames'}}
-                 }
-
-                 processed_turns.append(processed_turn)
-             except Exception as e:
-                 print(f"Error processing turn: {str(e)}")
-                 continue
-
-         return processed_turns
-
-     def convert_to_pipeline_format(self, schema_dialogues: List[SchemaGuidedDialogue]) -> List[Dict]:
-         """
-         Convert SchemaGuidedDialogues to the format expected by the ProcessingPipeline
-         """
-         pipeline_dialogues = []
-
-         for dialogue in schema_dialogues:
-             # Convert turns to the expected format
-             processed_turns = [
-                 {"speaker": turn["speaker"], "text": turn["text"]}
-                 for turn in dialogue.turns if turn["text"].strip()
-             ]
-
-             # Create dialogue in pipeline format
-             pipeline_dialogue = {
-                 'dialogue_id': dialogue.dialogue_id,
-                 'turns': processed_turns,
-                 'metadata': {
-                     'service_name': dialogue.service_name,
-                     'service_description': dialogue.service_description,
-                     'schema': dialogue.schema,
-                     **dialogue.original_metadata
-                 }
-             }
-
-             pipeline_dialogues.append(pipeline_dialogue)
-
-         return pipeline_dialogues
-
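
A minimal sketch of the load-and-convert flow the removed SchemaGuidedProcessor supported; the raw_datasets/schema_guided path and the example cap are assumed for illustration.

# Hypothetical usage of the removed SchemaGuidedProcessor; the base
# directory and max_examples value are illustrative assumptions.
from data_augmentation.pipeline_config import PipelineConfig

processor = SchemaGuidedProcessor(config=PipelineConfig())
dialogues = processor.load_dataset("raw_datasets/schema_guided", max_examples=100)
pipeline_dialogues = processor.convert_to_pipeline_format(dialogues)
print(f"{len(pipeline_dialogues)} dialogues ready for the augmentation pipeline")
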
data_augmentation_code/taskmaster_processor.py DELETED
@@ -1,192 +0,0 @@
- from dataclasses import dataclass, field
- from typing import List, Dict, Optional, Any
- import json
- import re
- from pathlib import Path
- from data_augmentation.pipeline_config import PipelineConfig
-
- @dataclass
- class TaskmasterDialogue:
-     """
-     Structured representation of a Taskmaster dialogue
-     """
-     conversation_id: str
-     instruction_id: Optional[str]
-     scenario: Optional[str]
-     domain: Optional[str]
-     turns: List[Dict[str, Any]]
-     original_metadata: Dict[str, Any] = field(default_factory=dict)
-
-     def __str__(self):
-         return f"TaskmasterDialogue(conversation_id={self.conversation_id}, turns={len(self.turns)} turns)"
-
-     def validate(self) -> bool:
-         return bool(self.conversation_id and isinstance(self.turns, list))
-
- class TaskmasterProcessor:
-     """
-     Handles processing and preparation of Taskmaster dataset dialogues
-     """
-     config: PipelineConfig
-     use_ontology: bool = False  # Whether to load and use ontology
-     ontology: Optional[Dict[str, Any]] = None  # Holds ontology data if loaded
-     domains: set  # Tracks unique domains (plain class, not a dataclass, so no field(); set in __init__)
-     scenarios: set  # Tracks unique scenarios (set in __init__)
-
-     def __init__(self, config: PipelineConfig, use_ontology: bool = False):
-         self.config = config
-         self.use_ontology = use_ontology
-         self.ontology = None
-         self.domains = set()
-         self.scenarios = set()
-
-     def load_dataset(self, base_dir: str, max_examples: Optional[int] = None) -> List[TaskmasterDialogue]:
-         """
-         Load and parse Taskmaster JSON dataset.
-         Handles self-dialogs, woz-dialogs, and ontology files.
-         """
-         required_files = {
-             "self-dialogs": "self-dialogs.json",
-             "woz-dialogs": "woz-dialogs.json",
-             "ontology": "ontology.json",
-         }
-
-         # Check for required files
-         missing_files = [name for name, path in required_files.items() if not Path(base_dir, path).exists()]
-         if missing_files:
-             raise FileNotFoundError(f"Missing required taskmaster files: {missing_files}")
-
-         # Load ontology
-         ontology_path = Path(base_dir, required_files['ontology'])
-         with open(ontology_path, 'r', encoding='utf-8') as f:
-             self.ontology = json.load(f)
-
-         processed_dialogues = []
-         for file_key in ["self-dialogs", "woz-dialogs"]:
-             file_path = Path(base_dir, required_files[file_key])
-             with open(file_path, 'r', encoding='utf-8') as f:
-                 raw_data = json.load(f)
-
-             for dialogue in raw_data:
-                 # Extract core dialogue components
-                 conversation_id = dialogue.get('conversation_id', '')
-                 instruction_id = dialogue.get('instruction_id', None)
-
-                 if 'utterances' in dialogue:
-                     turns = self._process_utterances(dialogue['utterances'])
-                     scenario = dialogue.get('scenario', '')
-                     domain = self._extract_domain(scenario)
-                 else:
-                     turns = []
-                     scenario = ''
-                     domain = ''
-
-                 # Store metadata
-                 metadata = {k: v for k, v in dialogue.items()
-                             if k not in {'conversation_id', 'instruction_id', 'utterances'}}
-
-                 # Create structured dialogue object
-                 processed_dialogue = TaskmasterDialogue(
-                     conversation_id=conversation_id,
-                     instruction_id=instruction_id,
-                     scenario=scenario,
-                     domain=domain,
-                     turns=turns,
-                     original_metadata=metadata
-                 )
-
-                 processed_dialogues.append(processed_dialogue)
-
-                 # Update domain and scenario tracking
-                 if domain:
-                     self.domains.add(domain)
-                 if scenario:
-                     self.scenarios.add(scenario)
-
-                 if max_examples and len(processed_dialogues) >= max_examples:
-                     break
-
-         return processed_dialogues
-
-     def _process_utterances(self, utterances: List[Dict]) -> List[Dict]:
-         """
-         Process utterances into a standardized format
-         """
-         processed_turns = []
-
-         for utterance in utterances:
-             # Map Taskmaster speaker roles to the expected format
-             speaker = 'assistant' if utterance.get('speaker') == 'ASSISTANT' else 'user'
-
-             # Extract and clean the text
-             text = utterance.get('text', '').strip()
-
-             # Extract any segments or annotations if present
-             segments = utterance.get('segments', [])
-
-             # Create the processed turn
-             turn = {
-                 'speaker': speaker,
-                 'text': text,
-                 'original_speaker': utterance.get('speaker', ''),
-                 'segments': segments,
-                 'metadata': {k: v for k, v in utterance.items()
-                              if k not in {'speaker', 'text', 'segments'}}
-             }
-
-             processed_turns.append(turn)
-
-         return processed_turns
-
-     def _extract_domain(self, scenario: str) -> str:
-         """
-         Extract domain from scenario description
-         """
-         domain_patterns = {
-             'restaurant': r'\b(restaurant|dining|food|reservation)\b',
-             'movie': r'\b(movie|cinema|film|ticket)\b',
-             'ride_share': r'\b(ride|taxi|uber|lyft)\b',
-             'coffee': r'\b(coffee|café|cafe|starbucks)\b',
-             'pizza': r'\b(pizza|delivery|order food)\b',
-             'auto': r'\b(car|vehicle|repair|maintenance)\b',
-         }
-
-         scenario_lower = scenario.lower()
-
-         for domain, pattern in domain_patterns.items():
-             if re.search(pattern, scenario_lower):
-                 return domain
-
-         return 'other'
-
-     def convert_to_pipeline_format(self, taskmaster_dialogues: List[TaskmasterDialogue]) -> List[Dict]:
-         """
-         Convert TaskmasterDialogues to the format expected by the ProcessingPipeline
-         """
-         pipeline_dialogues = []
-
-         for dialogue in taskmaster_dialogues:
-             # Convert turns to the expected format
-             processed_turns = []
-             for turn in dialogue.turns:
-                 if turn['text'].strip():  # Skip empty turns
-                     processed_turns.append({
-                         'speaker': turn['speaker'],
-                         'text': turn['text']
-                     })
-
-             # Create dialogue in pipeline format
-             pipeline_dialogue = {
-                 'dialogue_id': dialogue.conversation_id,
-                 'turns': processed_turns,
-                 'metadata': {
-                     'instruction_id': dialogue.instruction_id,
-                     'scenario': dialogue.scenario,
-                     'domain': dialogue.domain,
-                     **dialogue.original_metadata
-                 }
-             }
-
-             pipeline_dialogues.append(pipeline_dialogue)
-
-         return pipeline_dialogues
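
Likewise, a minimal sketch of how the removed TaskmasterProcessor was driven; the base directory and example cap are assumed for illustration.

# Hypothetical usage of the removed TaskmasterProcessor; the base
# directory is an illustrative assumption.
from data_augmentation.pipeline_config import PipelineConfig

processor = TaskmasterProcessor(config=PipelineConfig())
dialogues = processor.load_dataset("raw_datasets/taskmaster", max_examples=100)
valid = [d for d in dialogues if d.validate()]
pipeline_dialogues = processor.convert_to_pipeline_format(valid)
print(f"Domains seen: {sorted(processor.domains)}")
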
prepare_data.py CHANGED
@@ -16,12 +16,12 @@ logger = config_logger(__name__)
 os.environ["TOKENIZERS_PARALLELISM"] = "false"

 def main():
-     MODELS_DIR = 'new_iteration/data_prep_iterative_models'
-     PROCESSED_DATA_DIR = 'new_iteration/processed_outputs'
-     CACHE_DIR = 'new_iteration/cache'
+     MODELS_DIR = 'models'
+     PROCESSED_DATA_DIR = 'processed_outputs'
+     CACHE_DIR = os.path.join(MODELS_DIR, 'query_embeddings_cache')
      TOKENIZER_DIR = os.path.join(MODELS_DIR, 'tokenizer')
      FAISS_INDICES_DIR = os.path.join(MODELS_DIR, 'faiss_indices')
-     TF_RECORD_DIR = 'new_iteration/training_data'
+     TF_RECORD_DIR = 'training_data'
      FAISS_INDEX_PRODUCTION_PATH = os.path.join(FAISS_INDICES_DIR, 'faiss_index_production.index')
      JSON_TRAINING_DATA_PATH = os.path.join(PROCESSED_DATA_DIR, 'taskmaster_dialogues.json')
      CACHE_FILE = os.path.join(CACHE_DIR, 'query_embeddings_cache.pkl')
run_chatbot_validation.py CHANGED
@@ -44,7 +44,7 @@ def run_chatbot_validation():
      env = EnvironmentSetup()
      env.initialize()

-     MODEL_DIR = "new_iteration/data_prep_iterative_models"
+     MODEL_DIR = "models"
      FAISS_INDICES_DIR = os.path.join(MODEL_DIR, "faiss_indices")
      FAISS_INDEX_PRODUCTION_PATH = os.path.join(FAISS_INDICES_DIR, "faiss_index_production.index")
      FAISS_INDEX_TEST_PATH = os.path.join(FAISS_INDICES_DIR, "faiss_index_test.index")
new_iteration/run_taskmaster_processor.py → run_taskmaster_processor.py RENAMED
File without changes
new_iteration/taskmaster_processor.py → taskmaster_processor.py RENAMED
File without changes
tf_data_pipeline.py CHANGED
@@ -29,7 +29,7 @@ class TFDataPipeline:
      max_length: int = 512,
      neg_samples: int = 10,
      index_type: str = 'IndexFlatIP',
-     faiss_index_file_path: str = 'new_iteration/data_prep_iterative_models/faiss_indices/faiss_index_production.index',
+     faiss_index_file_path: str = 'models/faiss_indices/faiss_index_production.index',
      nlist: int = 100,
      max_retries: int = 3
  ):
unused/build_faiss_index.py DELETED
@@ -1,160 +0,0 @@
- # import os
- # import json
- # from pathlib import Path
-
- # import faiss
- # import numpy as np
- # import tensorflow as tf
- # from transformers import AutoTokenizer, TFAutoModel
- # from tqdm.auto import tqdm
-
- # from chatbot_model import ChatbotConfig, EncoderModel
- # from tf_data_pipeline import TFDataPipeline
- # from logger_config import config_logger
-
- # logger = config_logger(__name__)
- # os.environ["TOKENIZERS_PARALLELISM"] = "false"
-
- # def sanity_check(encoder: EncoderModel, tokenizer: AutoTokenizer, config: ChatbotConfig):
- #     """
- #     Perform a quick sanity check to ensure the model is loaded correctly.
- #     """
- #     sample_response = "This is a test response."
- #     encoded_sample = tokenizer(
- #         [sample_response],
- #         padding=True,
- #         truncation=True,
- #         max_length=config.max_context_token_limit,
- #         return_tensors='tf'
- #     )
-
- #     # Get embedding
- #     sample_embedding = encoder(encoded_sample['input_ids'], training=False).numpy()
-
- #     # Check shape
- #     if sample_embedding.shape[1] != config.embedding_dim:
- #         logger.error(
- #             f"Embedding dimension mismatch: Expected {config.embedding_dim}, "
- #             f"got {sample_embedding.shape[1]}"
- #         )
- #         raise ValueError("Embedding dimension mismatch.")
- #     else:
- #         logger.info("Embedding dimension matches the configuration.")
-
- #     # Check normalization
- #     embedding_norm = np.linalg.norm(sample_embedding, axis=1)
- #     if not np.allclose(embedding_norm, 1.0, atol=1e-5):
- #         logger.error("Embeddings are not properly normalized.")
- #         raise ValueError("Embeddings are not normalized.")
- #     else:
- #         logger.info("Embeddings are properly normalized.")
-
- #     logger.info("Sanity check passed: Model loaded correctly and outputs are as expected.")
-
- # def build_faiss_index():
- #     """
- #     Rebuild the FAISS index by:
- #     1) Loading your config.json
- #     2) Initializing encoder + loading submodule & custom weights
- #     3) Loading tokenizer from disk
- #     4) Creating a TFDataPipeline
- #     5) Setting the pipeline's response_pool from a JSON file
- #     6) Using pipeline.compute_and_index_response_embeddings()
- #     7) Saving the FAISS index
- #     """
- #     # Directories
- #     MODELS_DIR = Path("models")
- #     FAISS_DIR = MODELS_DIR / "faiss_indices"
- #     FAISS_INDEX_PATH = FAISS_DIR / "faiss_index_production.index"
- #     RESPONSES_PATH = FAISS_DIR / "faiss_index_production_responses.json"
- #     TOKENIZER_DIR = MODELS_DIR / "tokenizer"
- #     SHARED_ENCODER_DIR = MODELS_DIR / "shared_encoder"
- #     CUSTOM_WEIGHTS_PATH = MODELS_DIR / "encoder_custom_weights.weights.h5"
-
- #     # 1) Load ChatbotConfig
- #     config_path = MODELS_DIR / "config.json"
- #     if config_path.exists():
- #         with open(config_path, "r", encoding="utf-8") as f:
- #             config_dict = json.load(f)
- #         config = ChatbotConfig.from_dict(config_dict)
- #         logger.info(f"Loaded ChatbotConfig from {config_path}")
- #     else:
- #         config = ChatbotConfig()
- #         logger.warning(f"No config.json found at {config_path}. Using default ChatbotConfig.")
-
- #     # 2) Initialize the EncoderModel
- #     encoder = EncoderModel(config=config)
- #     logger.info("EncoderModel instantiated (empty).")
-
- #     # Overwrite the submodule from 'shared_encoder' directory
- #     if SHARED_ENCODER_DIR.exists():
- #         logger.info(f"Loading DistilBERT submodule from {SHARED_ENCODER_DIR}...")
- #         encoder.pretrained = TFAutoModel.from_pretrained(str(SHARED_ENCODER_DIR))
- #         logger.info("Loaded HF submodule into encoder.pretrained.")
- #     else:
- #         logger.warning(f"No shared_encoder directory at {SHARED_ENCODER_DIR}. Using default pretrained model.")
-
- #     # Build model once, then load custom weights (projection, etc.)
- #     dummy_input = tf.zeros((1, config.max_context_token_limit), dtype=tf.int32)
- #     _ = encoder(dummy_input, training=False)  # builds the layers
-
- #     if CUSTOM_WEIGHTS_PATH.exists():
- #         logger.info(f"Loading custom top-level weights from {CUSTOM_WEIGHTS_PATH}")
- #         encoder.load_weights(str(CUSTOM_WEIGHTS_PATH))
- #         logger.info("Custom top-level weights loaded successfully.")
- #     else:
- #         logger.warning(f"Custom weights file not found at {CUSTOM_WEIGHTS_PATH}.")
-
- #     # 3) Load tokenizer
- #     if TOKENIZER_DIR.exists():
- #         logger.info(f"Loading tokenizer from {TOKENIZER_DIR}")
- #         tokenizer = AutoTokenizer.from_pretrained(str(TOKENIZER_DIR))
- #     else:
- #         logger.warning(f"No tokenizer dir at {TOKENIZER_DIR}, falling back to default HF tokenizer.")
- #         tokenizer = AutoTokenizer.from_pretrained(config.pretrained_model)
-
- #     # 4) Quick sanity check
- #     sanity_check(encoder, tokenizer, config)
-
- #     # 5) Prepare a TFDataPipeline
- #     pipeline = TFDataPipeline(
- #         config=config,
- #         tokenizer=tokenizer,
- #         encoder=encoder,
- #         index_file_path=str(FAISS_INDEX_PATH),
- #         response_pool=[],
- #         max_length=config.max_context_token_limit,
- #         query_embeddings_cache={},
- #         neg_samples=config.neg_samples,
- #         index_type='IndexFlatIP',
- #         nlist=100,
- #         max_retries=config.max_retries
- #     )
-
- #     # 6) Load the existing response pool
- #     if not RESPONSES_PATH.exists():
- #         logger.error(f"Response pool JSON file not found at {RESPONSES_PATH}")
- #         raise FileNotFoundError(f"No response pool JSON at {RESPONSES_PATH}")
-
- #     with open(RESPONSES_PATH, "r", encoding="utf-8") as f:
- #         response_pool = json.load(f)
- #     logger.info(f"Loaded {len(response_pool)} responses from {RESPONSES_PATH}")
-
- #     pipeline.response_pool = response_pool  # assign to pipeline
-
- #     # 7) Build (or rebuild) the FAISS index from pipeline method
- #     # This does all the compute-embeddings + index.add in one place
- #     logger.info("Starting to compute and index response embeddings via TFDataPipeline...")
- #     pipeline.compute_and_index_response_embeddings()
-
- #     # 8) Save the rebuilt FAISS index
- #     pipeline.save_faiss_index(str(FAISS_INDEX_PATH))
-
- #     # Verify
- #     loaded_index = faiss.read_index(str(FAISS_INDEX_PATH))
- #     logger.info(f"Verified the rebuilt FAISS index has {loaded_index.ntotal} vectors.")
-
- #     return loaded_index, pipeline.response_pool
-
- # if __name__ == "__main__":
- #     build_faiss_index()
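
With this commented-out rebuild script deleted, a condensed sketch of the same flow using the retained TFDataPipeline API; keyword names mirror the commented code above and are assumptions where they differ from the current signature (note the live signature exposes faiss_index_file_path, while the old script passed index_file_path).

# Hypothetical condensed rebuild flow distilled from the deleted script;
# argument names follow the commented code above and are assumptions.
import json
from transformers import AutoTokenizer
from chatbot_model import ChatbotConfig, EncoderModel
from tf_data_pipeline import TFDataPipeline

config = ChatbotConfig()
encoder = EncoderModel(config=config)
tokenizer = AutoTokenizer.from_pretrained("models/tokenizer")

with open("models/faiss_indices/faiss_index_production_responses.json", "r", encoding="utf-8") as f:
    response_pool = json.load(f)

pipeline = TFDataPipeline(
    config=config,
    tokenizer=tokenizer,
    encoder=encoder,
    index_file_path="models/faiss_indices/faiss_index_production.index",  # assumed keyword
    response_pool=response_pool,
    max_length=config.max_context_token_limit,
    query_embeddings_cache={},
    neg_samples=config.neg_samples,
    index_type="IndexFlatIP",
    nlist=100,
    max_retries=config.max_retries,
)
pipeline.compute_and_index_response_embeddings()
pipeline.save_faiss_index("models/faiss_indices/faiss_index_production.index")
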
unused/gpu_monitor.py DELETED
@@ -1,59 +0,0 @@
- import tensorflow as tf
- from typing import List, Dict, Optional
- from dataclasses import dataclass
-
- from tqdm.auto import tqdm
-
- @dataclass
- class GPUMemoryStats:
-     total: int
-     used: int
-     free: int
-
- class GPUMemoryMonitor:
-     """Monitor GPU memory usage with safe CPU fallback."""
-     def __init__(self):
-         self.has_gpu = False
-         try:
-             gpus = tf.config.list_physical_devices('GPU')
-             self.has_gpu = len(gpus) > 0
-         except Exception:  # bare except narrowed to Exception
-             pass
-
-     def get_memory_stats(self) -> Optional[GPUMemoryStats]:
-         """Get current GPU memory statistics."""
-         if not self.has_gpu:
-             return None
-
-         try:
-             memory_info = tf.config.experimental.get_memory_info('GPU:0')
-             return GPUMemoryStats(
-                 total=memory_info['peak'],
-                 used=memory_info['current'],
-                 free=memory_info['peak'] - memory_info['current']
-             )
-         except Exception:
-             return None
-
-     def get_memory_usage(self) -> float:
-         """Get current GPU memory usage as a fraction (0.0-1.0)."""
-         if not self.has_gpu:
-             return 0.0
-         stats = self.get_memory_stats()
-         if stats is None or stats.total == 0:
-             return 0.0
-         return stats.used / stats.total
-
-     def should_reduce_batch_size(self) -> bool:
-         """Check if batch size should be reduced based on memory usage."""
-         if not self.has_gpu:
-             return False
-         usage = self.get_memory_usage()
-         return usage > 0.90
-
-     def can_increase_batch_size(self) -> bool:
-         """Check if batch size can be increased based on memory usage."""
-         if not self.has_gpu:
-             return True
-         usage = self.get_memory_usage()
-         return usage < 0.70
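
Finally, a minimal sketch of the adaptive batch-sizing loop the removed GPUMemoryMonitor enabled; the concrete batch-size bounds are illustrative assumptions.

# Hypothetical usage of the removed GPUMemoryMonitor; the batch-size
# bounds are illustrative assumptions.
monitor = GPUMemoryMonitor()
batch_size = 32
if monitor.should_reduce_batch_size():      # above ~90% GPU memory usage
    batch_size = max(batch_size // 2, 1)
elif monitor.can_increase_batch_size():     # below ~70% GPU memory usage
    batch_size = min(batch_size * 2, 256)
print(f"memory usage: {monitor.get_memory_usage():.0%}, batch size: {batch_size}")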