Spaces:

JoeArmani
/

csc525_retrieval_based_chatbot

Sleeping

App Files Files Community

JoeArmani committed on Dec 14, 2024

Commit

2a3cfd8

2 Parent(s): cf97979 098eba4

Merge branch 'dev'

Browse files

Files changed (14) hide show

.gitignore +2 -6
augmented_combined_dataset.json +0 -0
back_translator.py +56 -0
dialogue_augmenter.py +716 -0
main.py +124 -0
paraphraser.py +31 -0
pipeline_config.py +58 -0
processing_pipeline.py +176 -0
quality_metrics.py +129 -0
readme.md +43 -0
requirements.txt +12 -0
schema_guided_dialogue_processor.py +192 -0
setup.py +100 -0
taskmaster_processor.py +192 -0

.gitignore CHANGED Viewed

@@ -154,9 +154,5 @@ dmypy.json
 # Cython debug symbols
 cython_debug/
-# PyCharm
-#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
-#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
-#  and can be added to the global gitignore or merged into this file.  For a more nuclear
-#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/

 # Cython debug symbols
 cython_debug/
+datasets/*
+!datasets/.gitkeep

augmented_combined_dataset.json ADDED Viewed

The diff for this file is too large to render. See raw diff

back_translator.py ADDED Viewed

	@@ -0,0 +1,56 @@

+from transformers import (
+    MarianMTModel,
+    MarianTokenizer,
+)
+class BackTranslator:
+    """
+    Perform Back-translation with pivot language. English -> German -> Spanish -> English
+    Args:
+        source_lang: Source language (default: 'en')
+        pivot_lang: Pivot language (default: 'de')
+        target_lang: Target language (default: 'es')
+    Examples:
+        back_translator = BackTranslator()
+        back_translator.back_translate("Hello, how are you?")
+    """
+    def __init__(self, source_lang='en', pivot_lang='de', target_lang='es'):
+        # Forward (English to German)
+        pivot_forward_model_name = f'Helsinki-NLP/opus-mt-{source_lang}-{pivot_lang}'
+        self.tokenizer_pivot_forward = MarianTokenizer.from_pretrained(pivot_forward_model_name)
+        self.model_pivot_forward = MarianMTModel.from_pretrained(pivot_forward_model_name)
+        # Pivot translation model (German to Spanish)
+        pivot_backward_model_name = f'Helsinki-NLP/opus-mt-{pivot_lang}-{target_lang}'
+        self.tokenizer_pivot_backward = MarianTokenizer.from_pretrained(pivot_backward_model_name)
+        self.model_pivot_backward = MarianMTModel.from_pretrained(pivot_backward_model_name)
+        # Backward (Spanish to English)
+        backward_model_name = f'Helsinki-NLP/opus-mt-{target_lang}-{source_lang}'
+        self.tokenizer_backward = MarianTokenizer.from_pretrained(backward_model_name)
+        self.model_backward = MarianMTModel.from_pretrained(backward_model_name)
+    def back_translate(self, text):
+        """
+        Perform back-translation through German and Spanish to generate text variations.
+        Args:
+            text (str): The input text to be back-translated
+        Returns:
+            str: The back-translated text
+        """
+        # 1. English to German
+        encoded_pivot = self.tokenizer_pivot_forward([text], padding=True, truncation=True, return_tensors='pt')
+        generated_pivot = self.model_pivot_forward.generate(**encoded_pivot)
+        pivot_text = self.tokenizer_pivot_forward.batch_decode(generated_pivot, skip_special_tokens=True)[0]
+        # 2. German to Spanish
+        encoded_back_pivot = self.tokenizer_pivot_backward([pivot_text], padding=True, truncation=True, return_tensors='pt')
+        retranslated_pivot = self.model_pivot_backward.generate(**encoded_back_pivot)
+        tgt_text_back = self.tokenizer_pivot_backward.batch_decode(retranslated_pivot, skip_special_tokens=True)[0]
+        # 3. Spanish to English
+        encoded_back = self.tokenizer_backward([tgt_text_back], padding=True, truncation=True, return_tensors='pt')
+        retranslated = self.model_backward.generate(**encoded_back)
+        src_text = self.tokenizer_backward.batch_decode(retranslated, skip_special_tokens=True)[0]
+        return src_text

dialogue_augmenter.py ADDED Viewed

	@@ -0,0 +1,716 @@

+from typing import Dict, List
+import numpy as np
+import tensorflow as tf
+import tensorflow_hub as hub
+import re
+from pipeline_config import PipelineConfig
+from quality_metrics import QualityMetrics
+from paraphraser import Paraphraser
+from back_translator import BackTranslator
+import nlpaug.augmenter.word as naw
+from concurrent.futures import ThreadPoolExecutor
+from functools import lru_cache
+from sklearn.metrics.pairwise import cosine_similarity
+class DialogueAugmenter:
+    """
+    Optimized dialogue augmentation with quality control and complexity management.
+    """
+    def __init__(self, nlp, config: PipelineConfig):
+        self.nlp = nlp
+        self.config = config
+        self.quality_metrics = QualityMetrics(config)
+        self.use_model = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
+        # Advanced augmentation techniques
+        self.paraphraser = Paraphraser()
+        self.back_translator = BackTranslator()
+        # Basic augmentation techniques
+        self.word_augmenter = naw.SynonymAug(aug_src='wordnet')
+        self.spelling_augmenter = naw.SpellingAug()
+        self.augmenters = {
+            'advanced': [self.paraphraser, self.back_translator],
+            'basic': [
+                ('synonym', self.word_augmenter),
+                ('spelling', self.spelling_augmenter)
+            ]
+        }
+        # Initialize cache
+        self.embedding_cache = {}
+        self.perplexity_cache = {}
+        # Compile regex patterns
+        self.spelling_pattern = re.compile(r'[a-zA-Z]{3,}')
+        # GPU memory management
+        gpus = tf.config.list_physical_devices('GPU')
+        if gpus:
+            try:
+                for gpu in gpus:
+                    tf.config.experimental.set_memory_growth(gpu, True)
+            except RuntimeError as e:
+                print(e)
+    @lru_cache(maxsize=1024)
+    def _compute_embedding(self, text: str) -> np.ndarray:
+        """Cached computation of text embedding"""
+        return self.use_model([text])[0].numpy()
+    def _compute_batch_embeddings(self, texts: List[str]) -> np.ndarray:
+        """Compute embeddings for multiple texts at once"""
+        return self.use_model(texts).numpy()
+    def _quick_quality_check(self, variation: str, original: str) -> bool:
+        """
+        Simplified preliminary quality check with minimal standards
+        """
+        if self.config.debug:
+            print(f"\nQuick check for variation: {variation}")
+        # Only reject if length is extremely different
+        orig_len = len(original.split())
+        var_len = len(variation.split())
+        # For very short texts (1-3 words), allow more variation
+        if orig_len <= 3:
+            if var_len > orig_len * 4:  # Allow up to 4x length for short texts
+                if self.config.debug:
+                    print(f"Failed length check (short text): {var_len} vs {orig_len}")
+                return False
+        else:
+            if var_len > orig_len * 3:  # Allow up to 3x length for longer texts
+                if self.config.debug:
+                    print(f"Failed length check (long text): {var_len} vs {orig_len}")
+                return False
+        # Basic content check - at least one word in common (excluding stop words)
+        stop_words = {'a', 'an', 'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'is', 'are'}
+        orig_words = set(w.lower() for w in original.split() if w.lower() not in stop_words)
+        var_words = set(w.lower() for w in variation.split() if w.lower() not in stop_words)
+        if not orig_words.intersection(var_words):
+            if self.config.debug:
+                print("Failed content check: no content words in common")
+            return False
+        if self.config.debug:
+            print("Passed all quick checks")
+        return True
+    def _compute_metrics_parallel(self, original: str, candidates: List[str]) -> List[Dict[str, float]]:
+        """Compute quality metrics for multiple candidates in parallel"""
+        with ThreadPoolExecutor(max_workers=4) as executor:
+            futures = [
+                executor.submit(self.quality_metrics.compute_metrics, original, candidate)
+                for candidate in candidates
+            ]
+            return [future.result() for future in futures]
+    def _filter_variations_batch(self, variations: List[str], context: List[str], original_turn: str) -> List[str]:
+        """
+        Filter variations using batched computations with detailed logging
+        """
+        if not variations:
+            return []
+        if self.config.debug:
+            print(f"\nStarting filtration of {len(variations)} variations")
+            print(f"Context length: {len(context)}")
+            print(f"Original turn: {original_turn}")
+        words = original_turn.split()
+        if len(words) < 3:
+            if self.config.debug:
+                print("Short text detected, using predefined variations")
+            short_text_variations = self._augment_short_text({'text': original_turn, 'speaker': ''})
+            return [var['text'] for var in short_text_variations]
+        # If this is the first turn (no context), be more lenient
+        if not context:
+            preliminary_filtered = variations
+            if self.config.debug:
+                print("First turn - skipping preliminary filtering")
+        else:
+            # Quick preliminary filtering against original turn
+            preliminary_filtered = []
+            for var in variations:
+                passed = self._quick_quality_check(var, original_turn)
+                if self.config.debug:
+                    print(f"\nVariation: {var}")
+                    print(f"Passed quick check: {passed}")
+                if passed:
+                    preliminary_filtered.append(var)
+        if self.config.debug:
+            print(f"Variations after quick check: {len(preliminary_filtered)}")
+        if not preliminary_filtered:
+            return []
+        # Only use last turn for coherence
+        recent_context = [context[-1]] if context else []
+        context_text = ' '.join(recent_context) if recent_context else ''
+        # Even more lenient thresholds
+        min_similarity = 0.1  # Further reduced
+        min_coherence = 0.05  # Further reduced
+        if context_text:
+            if self.config.debug:
+                print(f"\nContext text: {context_text}")
+            all_texts = [context_text] + preliminary_filtered
+            all_embeddings = self._compute_batch_embeddings(all_texts)
+            context_embedding = all_embeddings[0]
+            variation_embeddings = all_embeddings[1:]
+            # Vectorized similarity computation
+            context_similarities = cosine_similarity([context_embedding], variation_embeddings)[0]
+            # Response coherence check
+            if recent_context:
+                prev_embedding = self._compute_embedding(recent_context[-1])
+                response_coherence = cosine_similarity([prev_embedding], variation_embeddings)[0]
+            else:
+                response_coherence = np.ones_like(context_similarities)
+            # Combined scoring with detailed logging
+            filtered_variations = []
+            for i, (variation, sim, coh) in enumerate(zip(
+                preliminary_filtered, context_similarities, response_coherence)):
+                # Use absolute values for scoring
+                combined_score = (
+                    self.config.context_similarity_weight * abs(sim) +
+                    self.config.response_coherence_weight * abs(coh)
+                )
+                if self.config.debug:
+                    print(f"\nVariation: {variation}")
+                    print(f"Context similarity: {sim:.3f}")
+                    print(f"Response coherence: {coh:.3f}")
+                    print(f"Combined score: {combined_score:.3f}")
+                # Accept if EITHER score is good enough
+                if (combined_score >= min_similarity or abs(coh) >= min_coherence):
+                    filtered_variations.append(variation)
+                    if self.config.debug:
+                        print("ACCEPTED")
+                else:
+                    if self.config.debug:
+                        print("REJECTED")
+                # If we have enough variations, stop
+                if len(filtered_variations) >= self.config.max_variations_per_turn:
+                    break
+        else:
+            filtered_variations = preliminary_filtered[:self.config.max_variations_per_turn]
+        if self.config.debug:
+            print(f"\nFinal filtered variations: {len(filtered_variations)}")
+        return filtered_variations
+    def _generate_variations_progressive(self, text: str, needed: int) -> List[str]:
+        """
+        Generate variations progressively until we have enough good ones
+        """
+        variations = set()
+        if self.config.debug:
+            print(f"\nAttempting to generate {needed} variations for text: {text}")
+        # Try advanced augmenters first
+        for augmenter in self.augmenters['advanced']:
+            if len(variations) >= needed:
+                break
+            try:
+                if isinstance(augmenter, Paraphraser):
+                    if self.config.debug:
+                        print("Trying paraphrase augmentation...")
+                    new_vars = augmenter.paraphrase(text, num_return_sequences=needed-len(variations))
+                    if self.config.debug:
+                        print(f"Paraphraser generated {len(new_vars)} variations")
+                else:
+                    if self.config.debug:
+                        print("Trying back translation...")
+                    new_vars = [augmenter.back_translate(text)]
+                    if self.config.debug:
+                        print(f"Back translator generated {len(new_vars)} variations")
+                valid_vars = [v for v in new_vars if v.strip() and v != text]
+                variations.update(valid_vars)
+                if self.config.debug:
+                    print(f"Current unique variations: {len(variations)}")
+            except Exception as e:
+                print(f"Error in advanced augmentation: {str(e)}")
+                continue
+        # Try basic augmenters if needed
+        if len(variations) < needed:
+            if self.config.debug:
+                print("Not enough variations, trying basic augmenters...")
+            for aug_type, augmenter in self.augmenters['basic']:
+                if len(variations) >= needed:
+                    break
+                try:
+                    if aug_type == 'spelling' and self._is_technical_or_formal_text(text):
+                        if self.config.debug:
+                            print("Skipping spelling augmentation for technical text")
+                        continue
+                    if self.config.debug:
+                        print(f"Trying {aug_type} augmentation...")
+                    new_vars = augmenter.augment(text, n=2)
+                    if isinstance(new_vars, list):
+                        valid_vars = [v for v in new_vars if v.strip() and v != text]
+                        variations.update(valid_vars)
+                    else:
+                        if new_vars.strip() and new_vars != text:
+                            variations.add(new_vars)
+                    if self.config.debug:
+                        print(f"After {aug_type}, total variations: {len(variations)}")
+                except Exception as e:
+                    print(f"Error in {aug_type} augmentation: {str(e)}")
+                    continue
+        variations_list = list(variations)
+        if self.config.debug:
+            print(f"Final number of variations generated: {len(variations_list)}")
+            if not variations_list:
+                print("WARNING: No variations were generated!")
+        return variations_list
+    def augment_dialogue(self, dialogue: Dict) -> List[Dict]:
+        """
+        Create augmented versions of the dialogue with optimized processing
+        """
+        # Early dialogue length check
+        original_length = len(dialogue['turns'])
+        if original_length > self.config.max_turns_per_dialogue:
+            if self.config.debug:
+                print(f"Truncating dialogue from {original_length} to {self.config.max_turns_per_dialogue} turns")
+            dialogue['turns'] = dialogue['turns'][:self.config.max_turns_per_dialogue]
+        turn_variations = []
+        context = []
+        # Process each turn with progressive generation
+        for turn in dialogue['turns']:
+            original_text = turn['text']  # Store original turn text
+            variations = self._generate_variations_progressive(
+                original_text,
+                self.config.max_variations_per_turn
+            )
+            # Batch filter variations with original text
+            filtered_variations = self._filter_variations_batch(
+                variations,
+                context,
+                original_text  # Pass the original turn text
+            )
+            # Create turn variations with speaker info
+            turn_vars = [{'speaker': turn['speaker'], 'text': v} for v in filtered_variations]
+            if self.config.debug:
+                print(f"Turn {len(turn_variations)}: Generated {len(turn_vars)} variations")
+            turn_variations.append(turn_vars)
+            context.append(original_text)
+        # Generate combinations with sampling
+        augmented_dialogues = self._generate_dialogue_combinations(
+            dialogue['dialogue_id'],
+            turn_variations
+        )
+        # Add original dialogue
+        result = [{
+            'dialogue_id': f"{dialogue['dialogue_id']}_original",
+            'turns': dialogue['turns']
+        }]
+        # Add unique augmentations
+        result.extend(augmented_dialogues[:self.config.augmentation_factor])
+        if self.config.debug:
+            print(f"Generated {len(result)-1} unique augmented dialogues")
+        return result
+    def _generate_dialogue_combinations(self, dialogue_id: str, turn_variations: List[List[Dict]]) -> List[Dict]:
+        """
+        Generate dialogue combinations using sampling
+        """
+        augmented_dialogues = []
+        used_combinations = set()
+        def generate_dialogues(current_turns=None, turn_index=0):
+            if current_turns is None:
+                current_turns = []
+            if len(augmented_dialogues) >= self.config.augmentation_factor:
+                return
+            if turn_index == len(turn_variations):
+                dialogue_fingerprint = " | ".join(turn['text'] for turn in current_turns)
+                if dialogue_fingerprint not in used_combinations:
+                    used_combinations.add(dialogue_fingerprint)
+                    augmented_dialogues.append({
+                        'dialogue_id': f"{dialogue_id}_aug_{len(augmented_dialogues)}",
+                        'turns': current_turns.copy()
+                    })
+                return
+            variations = list(turn_variations[turn_index])
+            np.random.shuffle(variations)
+            for variation in variations[:self.config.max_sampled_variations]:
+                if len(augmented_dialogues) >= self.config.augmentation_factor:
+                    return
+                current_turns.append(variation)
+                generate_dialogues(current_turns, turn_index + 1)
+                current_turns.pop()
+        try:
+            generate_dialogues()
+        except Exception as e:
+            print(f"Error in dialogue generation: {str(e)}")
+            return []
+        return augmented_dialogues
+    def _is_dialogue_duplicate(self, dialogue1: Dict, dialogue2: Dict) -> bool:
+        """
+        Check if two dialogues are duplicates.
+        """
+        text1 = " ".join(turn['text'] for turn in dialogue1['turns'])
+        text2 = " ".join(turn['text'] for turn in dialogue2['turns'])
+        return text1 == text2
+    # def _augment_turn(self, turn: Dict, context: List[str]) -> List[Dict]:
+    #     """
+    #     Generate augmented versions of the turn using multiple strategies.
+    #     """
+    #     text = turn['text']
+    #     words = text.split()
+    #     # Special handling for very short texts
+    #     if len(words) < 3:
+    #         return self._augment_short_text(turn)
+    #     all_variations = set()
+    #     # Advanced augmentations (paraphrase and back-translation)
+    #     for augmenter in self.augmenters['advanced']:
+    #         try:
+    #             if isinstance(augmenter, Paraphraser):
+    #                 variations = augmenter.paraphrase(text)
+    #                 all_variations.update(variations)
+    #             elif isinstance(augmenter, BackTranslator):
+    #                 aug_text = augmenter.back_translate(text)
+    #                 if aug_text:
+    #                     all_variations.add(aug_text)
+    #         except Exception as e:
+    #             print(f"Error in advanced augmentation: {str(e)}")
+    #             continue
+    #     # Basic nlpaug augmentations
+    #     for aug_type, augmenter in self.augmenters['basic']:
+    #         try:
+    #             if aug_type == 'spelling' and self._is_technical_or_formal_text(text):
+    #                 continue
+    #             aug_texts = augmenter.augment(text, n=2)
+    #             if isinstance(aug_texts, list):
+    #                 all_variations.update(aug_texts)
+    #             else:
+    #                 all_variations.add(aug_texts)
+    #         except Exception as e:
+    #             print(f"Error in {aug_type} augmentation: {str(e)}")
+    #             continue
+    #     # Remove exact duplicates and empty strings
+    #     augmented_texts = [t for t in list(all_variations) if t.strip()]
+    #     # Apply context filtering
+    #     if context:
+    #         augmented_texts = self._filter_by_context(augmented_texts, context)
+    #         print(f"After context filtering: {len(augmented_texts)} variations")
+    #     # Select best variations
+    #     best_variations = self._select_best_augmentations(
+    #         text,
+    #         augmented_texts,
+    #         num_to_select=self.config.augmentation_factor,
+    #         min_quality_score=0.7
+    #     )
+    #     # Create variations with speaker info
+    #     variations = [{'speaker': turn['speaker'], 'text': text} for text in best_variations]
+    #     return variations
+    def _augment_short_text(self, turn: Dict) -> List[Dict]:
+        """
+        Special handling for very short texts with predefined variations.
+        Args:
+            turn (Dict): Original dialogue turn
+        Returns:
+            List[Dict]: List of variations for the short text
+        """
+        text = turn['text']
+        common_variations = {
+            'goodbye': [
+                'Bye!', 'Farewell!', 'See you!', 'Take care!',
+                'Goodbye!', 'Bye for now!', 'Until next time!'
+            ],
+            'hello': [
+                'Hi!', 'Hey!', 'Hello!', 'Greetings!',
+                'Good day!', 'Hi there!', 'Hello there!'
+            ],
+            'yes': [
+                'Yes!', 'Correct!', 'Indeed!', 'Absolutely!',
+                'That\'s right!', 'Definitely!', 'Sure!'
+            ],
+            'no': [
+                'No!', 'Nope!', 'Not at all!', 'Negative!',
+                'Unfortunately not!', 'I\'m afraid not!'
+            ],
+            'thanks': [
+                'Thank you!', 'Thanks a lot!', 'Many thanks!',
+                'I appreciate it!', 'Thank you so much!'
+            ],
+            'ok': [
+                'Okay!', 'Alright!', 'Sure!', 'Got it!',
+                'Understood!', 'Fine!', 'Great!', 'Perfect!',
+                'That works!', 'Sounds good!'
+            ],
+            'good': [
+                'Great!', 'Excellent!', 'Perfect!', 'Wonderful!',
+                'Fantastic!', 'Amazing!', 'Terrific!'
+            ]
+        }
+        # Try to find matching variations
+        text_lower = text.lower().rstrip('!.,?')
+        variations = []
+        # Check if text matches any of our predefined categories
+        for key, predefined_vars in common_variations.items():
+            if key in text_lower or text_lower in key:
+                variations.extend(predefined_vars)
+        # If no predefined variations found, generate simple variants
+        if not variations:
+            # Add punctuation variations
+            variations = [
+                text.rstrip('!.,?') + '!',
+                text.rstrip('!.,?') + '.',
+                text.rstrip('!.,?')
+            ]
+            # Add capitalization variations
+            variations.extend([
+                v.capitalize() for v in variations
+                if v.capitalize() not in variations
+            ])
+        # Filter variations for uniqueness and quality
+        unique_variations = list(set(variations))
+        quality_variations = []
+        for var in unique_variations:
+            metrics = self.quality_metrics.compute_metrics(text, var)
+            quality_score = (
+                0.35 * metrics['semantic_similarity'] +
+                0.30 * (1.0 - metrics['perplexity'] / 100) +
+                0.15 * (1.0 - metrics['grammar_errors'] / 10) +
+                0.15 * metrics['content_preservation'] +
+                0.10 * metrics['type_token_ratio']
+            )
+            # More lenient quality threshold for short texts
+            if quality_score >= 0.5:  # Lower threshold for short texts
+                quality_variations.append(var)
+        # Ensure we have at least some variations
+        if not quality_variations:
+            quality_variations = [text]
+        # Return the variations with original speaker
+        return [{'speaker': turn['speaker'], 'text': v} for v in quality_variations[:self.config.augmentation_factor]]
+    def _is_technical_or_formal_text(self, text: str) -> bool:
+        """
+        Check if text is formal/technical and shouldn't have spelling variations.
+        """
+        formal_indicators = {
+            'technical_terms': {'api', 'config', 'database', 'server', 'system'},
+            'formal_phrases': {'please advise', 'regarding', 'furthermore', 'moreover'},
+            'professional_context': {'meeting', 'conference', 'project', 'deadline'}
+        }
+        text_lower = text.lower()
+        words = set(text_lower.split())
+        for category in formal_indicators.values():
+            if words.intersection(category):
+                return True
+        return False
+    # def _filter_by_context(self, variations: List[str], context: List[str]) -> List[str]:
+    #     """
+    #     Filter variations based on conversation context using config parameters.
+    #     """
+    #     # Manage context window using config
+    #     recent_context = context[-self.config.context_window_size:] if len(context) > self.config.context_window_size else context
+    #     filtered_variations = []
+    #     context_embedding = self.use_model([' '.join(recent_context)])[0].numpy()
+    #     prev_turn = recent_context[-1] if recent_context else ''
+    #     for variation in variations:
+    #         var_embedding = self.use_model([variation])[0].numpy()
+    #         # Overall context similarity
+    #         context_similarity = cosine_similarity([context_embedding], [var_embedding])[0][0]
+    #         # Direct response coherence
+    #         response_coherence = 1.0
+    #         if prev_turn:
+    #             prev_embedding = self.use_model([prev_turn])[0].numpy()
+    #             response_coherence = cosine_similarity([prev_embedding], [var_embedding])[0][0]
+    #         # Use weights from config
+    #         combined_similarity = (
+    #             self.config.context_similarity_weight * context_similarity +
+    #             self.config.response_coherence_weight * response_coherence
+    #         )
+    #         if (combined_similarity >= self.config.semantic_similarity_threshold and
+    #             response_coherence >= self.config.min_response_coherence):
+    #             filtered_variations.append(variation)
+    #             if self.config.debug:
+    #                 print(f"Accepted variation: {variation}")
+    #                 print(f"Context similarity: {context_similarity:.3f}")
+    #                 print(f"Response coherence: {response_coherence:.3f}")
+    #                 print(f"Combined score: {combined_similarity:.3f}\n")
+    #         else:
+    #             if self.config.debug:
+    #                 print(f"Rejected variation: {variation}")
+    #                 print(f"Combined score {combined_similarity:.3f} below threshold "
+    #                     f"{self.config.semantic_similarity_threshold}")
+    #                 print(f"Response coherence {response_coherence:.3f} below threshold "
+    #                     f"{self.config.min_response_coherence}\n")
+    #     return filtered_variations or variations  # Fallback to original
+    # def _select_best_augmentations(self, original: str, candidates: List[str], used_variations: set = None,
+    #                               num_to_select: int = 3, min_quality_score: float = 0.7) -> List[str]:
+    #     """
+    #     Select the best augmentations using a quality score.
+    #     Args:
+    #         original (str): The original text
+    #         candidates (List[str]): List of candidate augmented texts
+    #         used_variations (set): Set of already used variations
+    #         num_to_select (int): Number of variations to select
+    #         min_quality_score (float): Minimum quality score threshold
+    #     """
+    #     if used_variations is None:
+    #         used_variations = set()
+    #     candidates = [c for c in candidates if c.strip()]
+    #     # Skip short text
+    #     if len(original.split()) < 3:
+    #         print(f"Text too short for augmentation: {original}")
+    #         return [original]
+    #     scored_candidates = []
+    #     for candidate in candidates:
+    #         if candidate in used_variations:
+    #             continue
+    #         metrics = self.quality_metrics.compute_metrics(original, candidate)
+    #         # Add contextual penalty for inappropriate audience terms
+    #         audience_terms = {'everyone', 'everybody', 'folks', 'all', 'guys', 'people'}
+    #         has_audience_term = any(term in candidate.lower() for term in audience_terms)
+    #         audience_penalty = 0.2 if has_audience_term else 0.0
+    #         # Weighted quality score
+    #         quality_score = (
+    #             0.40 * metrics['semantic_similarity'] +          # Semantic preservation
+    #             0.25 * (1.0 - metrics['perplexity'] / 100) +     # Fluency
+    #             0.15 * (1.0 - metrics['grammar_errors'] / 10) +  # Grammar
+    #             0.15 * metrics['content_preservation'] +         # Content preservation
+    #             0.05 * metrics['type_token_ratio']               # Lexical diversity
+    #         )
+    #         quality_score -= audience_penalty
+    #         if (metrics['semantic_similarity'] < 0.5 or     # Reject on semantic threshold miss
+    #             metrics['rouge1_f1'] < 0.2):                # Enforce minimum lexical overlap
+    #             continue
+    #         # Bonus points for:
+    #         # Length similarity to original
+    #         if 0.75 <= metrics['length_ratio'] <= 1.25:
+    #             quality_score += 0.05
+    #         # Correct grammar
+    #         if metrics['grammar_errors'] == 0:
+    #             quality_score += 0.025
+    #         print(f"Candidate: {candidate}")
+    #         print(f"Quality score: {quality_score:.2f}, Metrics: {metrics}")
+    #         # Consider the augmentation if it meets the basic quality threshold
+    #         if quality_score >= min_quality_score:
+    #             print('Candidate accepted\n')
+    #             scored_candidates.append((candidate, quality_score, metrics))
+    #         else:
+    #             print('Candidate rejected\n')
+    #     # Sort by quality score, best first
+    #     scored_candidates.sort(key=lambda x: x[1], reverse=True)
+    #     selected = []
+    #     for candidate, score, metrics in scored_candidates:
+    #         # Check diversity against already selected
+    #         if len(selected) == 0:
+    #             selected.append(candidate)
+    #             continue
+    #         # Compute average similarity to already selected
+    #         avg_similarity = np.mean([
+    #             self.quality_metrics.compute_semantic_similarity(candidate, prev)
+    #             for prev in selected
+    #         ])
+    #         # Add if sufficiently different (similarity < 0.98)
+    #         if avg_similarity < 0.98:
+    #             selected.append(candidate)
+    #         if len(selected) >= num_to_select:
+    #             break
+    #     return selected

main.py ADDED Viewed

	@@ -0,0 +1,124 @@

+"""
+CSC525 - Module 8 Option 2 - Joseph Armani
+Description and References in the README.md file.
+"""
+import json
+import tensorflow as tf
+from typing import List, Dict
+from pipeline_config import PipelineConfig
+from processing_pipeline import ProcessingPipeline
+from taskmaster_processor import TaskmasterProcessor
+from schema_guided_dialogue_processor import SchemaGuidedProcessor
+def combine_datasets(taskmaster_dialogues: List[Dict],
+                    schema_guided_dialogues: List[Dict]) -> List[Dict]:
+    """
+    Combine dialogues from both datasets into a single list
+    Args:
+        taskmaster_dialogues: List of dialogues in pipeline format from Taskmaster
+        schema_guided_dialogues: List of dialogues in pipeline format from Schema-Guided
+    Returns:
+        List[Dict]: Combined list of dialogues
+    """
+    # Ensure unique dialogue IDs
+    combined_dialogues = []
+    seen_ids = set()
+    duplicate_count = 0  # Track duplicates for reporting
+    for dialogue in taskmaster_dialogues:
+        dialogue_copy = dialogue.copy()
+        dialogue_id = dialogue_copy['dialogue_id']
+        if dialogue_id in seen_ids:
+            duplicate_count += 1
+            dialogue_id = f"taskmaster_{dialogue_id}"
+        seen_ids.add(dialogue_id)
+        dialogue_copy['dialogue_id'] = dialogue_id
+        combined_dialogues.append(dialogue_copy)
+    for dialogue in schema_guided_dialogues:
+        dialogue_copy = dialogue.copy()
+        dialogue_id = dialogue_copy['dialogue_id']
+        if dialogue_id in seen_ids:
+            duplicate_count += 1
+            dialogue_id = f"schema_guided_{dialogue_id}"
+        seen_ids.add(dialogue_id)
+        dialogue_copy['dialogue_id'] = dialogue_id
+        combined_dialogues.append(dialogue_copy)
+    # Log the results
+    print(f"Combine Datasets: Found and resolved {duplicate_count} duplicate dialogue IDs.")
+    print(f"Combine Datasets: Total dialogues combined: {len(combined_dialogues)}")
+    return combined_dialogues
+def main():
+    # Configuration
+    config = PipelineConfig(
+        min_length=1,
+        max_length=512,
+        batch_size=32 if tf.config.list_physical_devices('GPU') else 16,
+        max_turns_per_dialogue=6,
+        max_variations_per_turn=3,
+        max_sampled_variations=2,
+        context_window_size=4,
+        max_complexity_threshold=100,
+        use_cache=False,
+        debug=True,
+        allowed_speakers=['user', 'assistant'],
+        required_fields=['dialogue_id', 'turns']
+    )
+    try:
+        # Set max_examples (Optional[int]) for testing
+        max_examples = None
+        # Initialize and load Taskmaster dataset
+        print("Loading Taskmaster dataset")
+        taskmaster_processor = TaskmasterProcessor(config, use_ontology=False)
+        taskmaster_dir = './datasets/taskmaster'
+        taskmaster_dialogues = taskmaster_processor.load_dataset(taskmaster_dir, max_examples=max_examples)
+        taskmaster_pipeline_dialogues = taskmaster_processor.convert_to_pipeline_format(taskmaster_dialogues)
+        print(f"Processed Taskmaster dialogues: {len(taskmaster_pipeline_dialogues)}")
+        # Initialize and load Schema-Guided dataset
+        print("Loading Schema-Guided dataset")
+        schema_dialogue_processor = SchemaGuidedProcessor(config)
+        schema_guided_dir = './datasets/schema_guided'
+        schema_dialogues = schema_dialogue_processor.load_dataset(schema_guided_dir, max_examples=max_examples)
+        schema_pipeline_dialogues = schema_dialogue_processor.convert_to_pipeline_format(schema_dialogues)
+        print(f"Processed Schema-Guided dialogues: {len(schema_pipeline_dialogues)}")
+        # Combine datasets
+        print("Combining datasets")
+        combined_dialogues = combine_datasets(taskmaster_pipeline_dialogues, schema_pipeline_dialogues)
+        print(f"Combined Dialogues: {len(combined_dialogues)}")
+        if not combined_dialogues:
+            print("Combined dialogues are empty. Exiting.")
+            return
+        # Process through augmentation pipeline
+        print("Processing combined dataset")
+        pipeline = ProcessingPipeline(config)
+        processed_dialogues = pipeline.process_dataset(combined_dialogues)
+        # Save results
+        output_path = 'augmented_combined_dataset.json'
+        with open(output_path, 'w', encoding='utf-8') as f:
+            json.dump(processed_dialogues, f, indent=2, ensure_ascii=False)
+        # Print statistics
+        print(f"\nProcessed Statistics:")
+        print(f"Total dialogues: {len(processed_dialogues)}")
+        print(f"Taskmaster domains: {len(taskmaster_processor.domains)}")
+        print(f"Schema-Guided services: {len(schema_dialogue_processor.services)}")
+        print(f"Schema-Guided domains: {len(schema_dialogue_processor.domains)}")
+    except Exception as e:
+        print(f"Processing failed: {str(e)}")
+        raise
+if __name__ == "__main__":
+    main()

paraphraser.py ADDED Viewed

	@@ -0,0 +1,31 @@

+from transformers import (
+    AutoTokenizer,
+    AutoModelForSeq2SeqLM,
+)
+class Paraphraser:
+    def __init__(self, model_name='humarin/chatgpt_paraphraser_on_T5_base'):
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+        self.model.eval()
+    def paraphrase(self, text, num_return_sequences=5, num_beams=10, num_beam_groups=5, diversity_penalty=0.8):
+        try:
+            input_text = "paraphrase: " + text + " </s>"
+            encoding = self.tokenizer.encode_plus(input_text, return_tensors="pt")
+            input_ids = encoding["input_ids"]
+            outputs = self.model.generate(
+                input_ids=input_ids,
+                max_length=256,
+                num_beams=num_beams,
+                num_beam_groups=num_beam_groups,
+                num_return_sequences=num_return_sequences,
+                diversity_penalty=diversity_penalty,
+                early_stopping=True
+            )
+            paraphrases = [self.tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
+            return paraphrases
+        except Exception as e:
+            print(f"Error in paraphrasing: {e}")
+            return []

pipeline_config.py ADDED Viewed

	@@ -0,0 +1,58 @@

+from dataclasses import dataclass
+from typing import List
+@dataclass
+class PipelineConfig:
+    """
+    Config for the pipeline
+    """
+    # Validation settings
+    min_length: int = 1
+    max_length: int = 512
+    min_tokens: int = 1
+    max_tokens: int = 128
+    allowed_speakers: List[str] = None
+    required_fields: List[str] = None
+    # Text augmentation settings
+    augmentation_factor: int = 4
+    augmentation_techniques: List[str] = None
+    max_turns_per_dialogue: int = 6
+    max_variations_per_turn: int = 3
+    max_sampled_variations: int = 2
+    max_complexity_threshold: int = 100
+    complexity_reduction_turns: int = 4
+    # Quality thresholds
+    semantic_similarity_threshold: float = 0.45
+    grammar_error_threshold: int = 2
+    rouge1_f1_threshold: float = 0.30
+    rouge2_f1_threshold: float = 0.15
+    perplexity_threshold: float = 50.0
+    # Response coherence thresholds
+    min_response_coherence: float = 0.3
+    context_similarity_weight: float = 0.35
+    response_coherence_weight: float = 0.65
+    # Performance settings
+    batch_size: int = 32
+    use_cache: bool = True
+    debug: bool = False
+    context_window_size: int = 4
+    def __post_init__(self):
+        if self.allowed_speakers is None:
+            self.allowed_speakers = ['user', 'assistant']
+        if self.required_fields is None:
+            self.required_fields = ['dialogue_id', 'turns']
+        if self.augmentation_techniques is None:
+            self.augmentation_techniques = ['paraphrase', 'back_translation']
+        # Validate weights sum to 1.0
+        if abs((self.context_similarity_weight + self.response_coherence_weight) - 1.0) > 1e-6:
+            raise ValueError("Context similarity and response coherence weights must sum to 1.0")

processing_pipeline.py ADDED Viewed

	@@ -0,0 +1,176 @@

+from datetime import datetime
+from pathlib import Path
+from typing import List, Dict, Optional
+import json
+import re
+import hashlib
+import pickle
+import spacy
+from tqdm import tqdm
+from pipeline_config import PipelineConfig
+from dialogue_augmenter import DialogueAugmenter
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+class ProcessingPipeline:
+    """
+    Complete pipeline combining validation, optimization, and augmentation.
+    """
+    def __init__(self, config: Optional[PipelineConfig] = None):
+        self.config = config or PipelineConfig()
+        self.nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])
+        self.augmenter = DialogueAugmenter(self.nlp, self.config)
+        self.num_threads = self.config.batch_size
+        self.cache_dir = Path("./cache")
+        self.cache_dir.mkdir(exist_ok=True)
+    def process_dataset(self, dialogues: List[Dict]) -> List[Dict]:
+        """
+        Process entire dataset through the pipeline.
+        """
+        print(f"Processing {len(dialogues)} dialogues")
+        start_time = datetime.now()
+        # Check cache
+        if self.config.use_cache:
+            cache_path = self._get_cache_path(dialogues)
+            if cache_path.exists():
+                print("Loading from cache...")
+                with open(cache_path, 'rb') as f:
+                    return pickle.load(f)
+        # Validate and clean
+        valid_dialogues = self._process_validation(
+            dialogues,
+            self._validate_and_clean_dialogue,
+            "validating and cleaning"
+        )
+        if not valid_dialogues:
+            raise ValueError("Dialogue validation resulted in an empty dataset.")
+        deduplicated_dialogues = self._deduplicate_dialogues(valid_dialogues)
+        # Augment dialogues
+        all_processed_dialogues = []
+        for dialogue in deduplicated_dialogues:
+            augmented = self.augmenter.augment_dialogue(dialogue)
+            all_processed_dialogues.extend(augmented)
+        # Save to cache
+        if self.config.use_cache:
+            with open(cache_path, 'wb') as f:
+                pickle.dump(all_processed_dialogues, f)
+        processing_time = datetime.now() - start_time
+        print(f"Processing completed in {processing_time}")
+        print(f"Generated {len(all_processed_dialogues)} total dialogues")
+        return all_processed_dialogues
+    def _deduplicate_dialogues(self, dialogues: List[Dict], threshold: float = 0.9) -> List[Dict]:
+        """
+        Deduplicate dialogues based on text similarity.
+        """
+        print("Deduplicating dialogues...")
+        if not dialogues:
+            print("No dialogues provided for deduplication.")
+            return []
+        # Combine turns into single text for similarity comparison
+        texts = [" ".join(turn['text'] for turn in dialogue['turns']) for dialogue in dialogues]
+        tfidf = TfidfVectorizer().fit_transform(texts)
+        sim_matrix = cosine_similarity(tfidf)
+        unique_indices = set()
+        for i, row in enumerate(sim_matrix):
+            if i not in unique_indices:
+                similar_indices = [j for j, sim in enumerate(row) if sim > threshold and j != i]
+                unique_indices.add(i)
+                unique_indices.difference_update(similar_indices)
+        deduplicated_dialogues = [dialogues[i] for i in unique_indices]
+        print(f"Deduplication complete. Reduced from {len(dialogues)} to {len(deduplicated_dialogues)} dialogues.")
+        return deduplicated_dialogues
+    def _validate_and_clean_dialogue(self, dialogue: Dict) -> Optional[Dict]:
+        """
+        Validate and clean a single dialogue.
+        """
+        try:
+            # Check required fields
+            if not all(field in dialogue for field in self.config.required_fields):
+                return None
+            # Process turns
+            cleaned_turns = []
+            for turn in dialogue['turns']:
+                if self._validate_turn(turn):
+                    cleaned_turn = {
+                        'speaker': turn['speaker'],
+                        'text': self._clean_text(turn['text'])
+                    }
+                    cleaned_turns.append(cleaned_turn)
+            if cleaned_turns:
+                return {
+                    'dialogue_id': dialogue['dialogue_id'],
+                    'turns': cleaned_turns
+                }
+            return None
+        except Exception as e:
+            print(f"Error processing dialogue {dialogue.get('dialogue_id', 'unknown')}: {str(e)}")
+            return None
+    def _validate_turn(self, turn: Dict) -> bool:
+        """
+        Validate a single speaking turn.
+        """
+        return (
+            turn['speaker'] in self.config.allowed_speakers and
+            self.config.min_length <= len(turn['text']) <= self.config.max_length
+        )
+    def _clean_text(self, text: str) -> str:
+        """
+        Clean and normalize text.
+        """
+        # Remove excessive whitespace
+        text = re.sub(r'\s+', ' ', text.strip())
+        # Normalize quotes and apostrophes
+        text = re.sub(r'[’´`]', "'", text)
+        text = re.sub(r'[“”]', '"', text)
+        # Remove control characters
+        text = "".join(char for char in text if ord(char) >= 32 or char == '\n')
+        return text
+    def _process_validation(self, items: List, func, description: str) -> List:
+        """
+        Process items sequentially with a progress bar.
+        """
+        results = []
+        print(f"Starting {description}")
+        for item in tqdm(items, desc=description):
+            try:
+                result = func(item)
+                if result is not None:
+                    results.append(result)
+            except Exception as e:
+                print(f"Error processing item: {str(e)}")
+        print(f"Completed {description}. Processed {len(results)} items successfully")
+        return results
+    def _get_cache_path(self, data: List[Dict]) -> Path:
+        """
+        Generate cache file path based on data hash.
+        """
+        data_str = json.dumps(data, sort_keys=True)
+        hash_value = hashlib.md5(data_str.encode()).hexdigest()
+        return self.cache_dir / f"cache_{hash_value}.pkl"

quality_metrics.py ADDED Viewed

	@@ -0,0 +1,129 @@

+import torch
+import tensorflow as tf
+import tensorflow_hub as hub
+from transformers import GPT2TokenizerFast, GPT2LMHeadModel
+import language_tool_python
+from rouge_score import rouge_scorer
+import spacy
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
+from typing import Dict
+from pipeline_config import PipelineConfig
+class QualityMetrics:
+    """
+    Measure augmented text quality
+    """
+    def __init__(self, config: PipelineConfig):
+        self.config = config
+        # Semantic similarity
+        self.use_model = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
+        # Fluency metrics
+        self.tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
+        self.model = GPT2LMHeadModel.from_pretrained('gpt2')
+        self.model.eval()
+        # Grammar
+        self.language_tool = language_tool_python.LanguageTool('en-US')
+        # Lexical similarity
+        self.rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
+        # Diversity
+        self.nlp = spacy.load('en_core_web_sm')
+    def compute_perplexity(self, text):
+        try:
+            encodings = self.tokenizer(text, return_tensors='pt')
+            input_ids = encodings['input_ids']
+            with torch.no_grad():
+                outputs = self.model(input_ids, labels=input_ids)
+                loss = outputs.loss
+                perplexity = torch.exp(loss)
+                return perplexity.item()
+        except Exception as e:
+            print(f"Error computing perplexity for text '{text}': {e}")
+            return float('inf')  # High perplexity value == poor quality
+    def compute_semantic_similarity(self, text1: str, text2: str) -> float:
+        """
+        Compute semantic similarity between two texts using the Universal Sentence Encoder.
+        Args:
+            text1 (str): First text
+            text2 (str): Second text
+        Returns:
+            float: Cosine similarity score between the two texts (0-1)
+        """
+        embeddings = self.use_model([text1, text2])
+        emb1, emb2 = embeddings[0].numpy(), embeddings[1].numpy()
+        return cosine_similarity([emb1], [emb2])[0][0]
+    def compute_metrics(self, original: str, augmented: str) -> Dict[str, float]:
+        """
+        Compute quality metrics
+        """
+        metrics = {}
+        # 1. Semantic Preservation
+        embeddings = self.use_model([original, augmented])
+        emb_orig, emb_aug = embeddings[0].numpy(), embeddings[1].numpy()
+        metrics['semantic_similarity'] = cosine_similarity([emb_orig], [emb_aug])[0][0]
+        # 2. Fluency & Naturalness
+        metrics['perplexity'] = self.compute_perplexity(augmented)
+        metrics['grammar_errors'] = len(self.language_tool.check(augmented))
+        # 3. Lexical Diversity
+        doc_orig = self.nlp(original)
+        doc_aug = self.nlp(augmented)
+        # Type-token ratio with safety check
+        aug_tokens = [token.text.lower() for token in doc_aug]
+        metrics['type_token_ratio'] = len(set(aug_tokens)) / max(len(aug_tokens), 1)
+        # Content word overlap with safety checks
+        orig_content = set([token.text.lower() for token in doc_orig if not token.is_stop])
+        aug_content = set([token.text.lower() for token in doc_aug if not token.is_stop])
+        # Safety check for empty content sets
+        if len(orig_content) == 0:
+            metrics['content_preservation'] = 1.0 if len(aug_content) == 0 else 0.0
+        else:
+            metrics['content_preservation'] = len(orig_content.intersection(aug_content)) / len(orig_content)
+        # 4. Structural Preservation
+        rouge_scores = self.rouge.score(original, augmented)
+        metrics['rouge1_f1'] = rouge_scores['rouge1'].fmeasure
+        metrics['rouge2_f1'] = rouge_scores['rouge2'].fmeasure
+        metrics['rougeL_f1'] = rouge_scores['rougeL'].fmeasure
+        # 5. Length Preservation with safety check
+        orig_words = len(original.split())
+        aug_words = len(augmented.split())
+        metrics['length_ratio'] = aug_words / max(orig_words, 1)
+        return metrics
+    def meets_quality_threshold(self, metrics: Dict[str, float]) -> bool:
+        """
+        Enhanced quality threshold checking
+        """
+        # Core quality checks
+        basic_quality = (
+            metrics['perplexity'] <= self.config.perplexity_threshold and
+            metrics['semantic_similarity'] >= self.config.semantic_similarity_threshold and
+            metrics['grammar_errors'] <= self.config.grammar_error_threshold
+        )
+        # Length preservation check
+        length_ok = 0.6 <= metrics['length_ratio'] <= 1.4
+        # Diversity check
+        diversity_ok = metrics['type_token_ratio'] >= 0.4
+        # Content preservation check
+        content_ok = metrics['content_preservation'] >= 0.6
+        return all([basic_quality, length_ok, diversity_ok, content_ok])

readme.md ADDED Viewed

	@@ -0,0 +1,43 @@

+# Retrieval-based learning chatbot
+CSC525 - Module 8 Option 2 - Retrieval-based Learning Chatbot - Joseph Armani
+## TODO
+A Python tool to generate high-quality dialog variations.
+This package automatically downloads the following models during installation:
+- Universal Sentence Encoder v4 (TensorFlow Hub)
+- ChatGPT Paraphraser T5-base
+- Helsinki-NLP translation models (en-de, de-es, es-en)
+- GPT-2 (for perplexity scoring)
+- spaCy en_core_web_sm
+- nltk wordnet and averaged_perceptron_tagger_eng models
+## Install package
+pip install -e .
+## Description
+This Python script demonstrates a complete pipeline for dialogue augmentation, including validation, optimization, and data augmentation.
+It creates high-quality augmented versions of dialogues by applying various text augmentation techniques and quality control checks.
+Two approaches are used for text augmentation: paraphrasing and back-translation. The pipeline also includes quality metrics for evaluating the augmented text.
+Special handling is implemented for very short text such as greetings and farewells, which are predefined and filtered for quality.
+The pipeline is designed to process a dataset of dialogues and generate multiple high-quality augmented versions of each dialogue.
+The pipeline ensures duplicate dialogues are not generated and that the output meets quality thresholds for semantic similarity, grammar, fluency, diversity, and content preservation.
+## References
+Accsany, P. (2024). Working with JSON data in Python. Real Python. <https://realpython.com/python-json/>
+Explosion AI Team. (n.d.). Spacy · industrial-strength natural language processing in python. <https://spacy.io/>
+GeeksforGeeks. (2024). Text augmentation techniques in NLP. GeeksforGeeks. <https://www.geeksforgeeks.org/text-augmentation-techniques-in-nlp/>
+Helsinki-NLP. (2024). Opus-MT [Computer software]. GitHub. <https://github.com/Helsinki-NLP/Opus-MT>
+Hugging Face. (n.d.). Transformers. Hugging Face. <https://huggingface.co/docs/transformers/en/index>
+Humarin. (2023). ChatGPT paraphraser on T5-base [Computer software]. Hugging Face. <https://huggingface.co/humarin/chatgpt_paraphraser_on_T5_base>
+Keita, Z. (2022). Data augmentation in NLP using back-translation with MarianMT. Towards Data Science. <https://towardsdatascience.com/data-augmentation-in-nlp-using-back-translation-with-marianmt-a8939dfea50a>
+Memgraph. (2023). Cosine similarity in Python with scikit-learn. Memgraph. <https://memgraph.com/blog/cosine-similarity-python-scikit-learn>
+Morris, J. (n.d.). language-tool-python (Version 2.8.1) [Computer software]. PyPI. <https://pypi.org/project/language-tool-python/>
+TensorFlow. (n.d.). Universal sentence encoder. TensorFlow Hub. <https://www.tensorflow.org/hub/tutorials/semantic_similarity_with_tf_hub_universal_encoder>
+Waheed, A. (2023). How to calculate ROUGE score in Python. Python Code. <https://thepythoncode.com/article/calculate-rouge-score-in-python>

requirements.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+spacy>=3.0.0  # Text processing and tokenization
+numpy>=1.19.0  # General numerical computation
+tqdm>=4.64.0  # Progress bar
+torch>=1.10.0  # PyTorch, for deep learning
+tensorflow>=2.6.0  # TensorFlow, for deep learning
+tensorflow-hub>=0.12.0  # Pretrained model hub for TensorFlow
+transformers>=4.21.0  # Hugging Face Transformers library
+rouge-score>=0.1.2  # ROUGE metric for evaluation
+language-tool-python>=2.7.1  # Grammar checking and text correction
+scikit-learn>=1.0.0  # Machine learning tools
+nlpaug>=1.1.0  # Data augmentation for NLP
+nltk>=3.6.0  # Natural language toolkit

schema_guided_dialogue_processor.py ADDED Viewed

	@@ -0,0 +1,192 @@

+from dataclasses import dataclass, field
+from typing import List, Dict, Optional, Any
+import json
+import glob
+from pathlib import Path
+from pipeline_config import PipelineConfig
+@dataclass
+class SchemaGuidedDialogue:
+    """
+    Structured representation of a Schema-Guided dialogue
+    """
+    # Dialogue identifier
+    dialogue_id: str
+    # Name of the service the dialogue is attributed to; None when no service is known
+    service_name: Optional[str]
+    # Human-readable description of that service, if available
+    service_description: Optional[str]
+    # Schema of the service (empty dict when none is known)
+    schema: Dict[str, Any]
+    # Dialogue turns, one dict per turn
+    turns: List[Dict[str, Any]]
+    # Additional information carried over from the source record
+    original_metadata: Dict[str, Any] = field(default_factory=dict)
+class SchemaGuidedProcessor:
+    """
+    Handles processing and preparation of Schema-Guided dataset dialogues
+    """
+    def __init__(self, config: PipelineConfig):
+        self.config = config
+        self.services = set()
+        self.domains = set()
+        self.schemas = {}
+    def load_dataset(self, base_dir, max_examples: Optional[int] = None) -> List[SchemaGuidedDialogue]:
+        """
+        Load and parse Schema-Guided Dialogue dataset
+        Args:
+            dialogue_path: Path to the dialogue JSON file
+            schema_path: Path to the schema JSON file
+        """
+        # Define schema and dialogue file patterns
+        schema_file = Path(base_dir, "schema.json")
+        dialogue_files_pattern = str(Path(base_dir, "dialogues_*.json"))
+        # Check for schema file
+        if not schema_file.exists():
+            raise FileNotFoundError(f"Schema file not found at {schema_file}")
+        # Load schema
+        self.schemas = self._load_schemas(schema_file)
+        # Find and validate dialogue files
+        dialogue_files = glob.glob(dialogue_files_pattern)
+        if not dialogue_files:
+            raise FileNotFoundError(f"No dialogue files found matching pattern {dialogue_files_pattern}")
+        print(f"Found {len(dialogue_files)} dialogue files to process.")
+        # Process all dialogues
+        processed_dialogues = []
+        for file_path in dialogue_files:
+            with open(file_path, 'r', encoding='utf-8') as f:
+                raw_dialogues = json.load(f)
+            for dialogue in raw_dialogues:
+                processed_dialogues.append(self._process_single_dialogue(dialogue))
+                if max_examples and len(processed_dialogues) >= max_examples:
+                    break
+        return processed_dialogues
+    def _process_single_dialogue(self, dialogue: Dict[str, Any]) -> SchemaGuidedDialogue:
+        """
+        Process a single dialogue JSON object into a SchemaGuidedDialogue object.
+        """
+        dialogue_id = str(dialogue.get("dialogue_id", ""))
+        services = dialogue.get("services", [])
+        service_name = services[0] if services else None
+        schema = self.schemas.get(service_name, {})
+        service_description = schema.get("description", "")
+        # Process turns
+        turns = self._process_turns(dialogue.get("turns", []))
+        # Store metadata
+        metadata = {
+            "services": services,
+            "original_id": dialogue_id,
+        }
+        return SchemaGuidedDialogue(
+            dialogue_id=f"schema_guided_{dialogue_id}",
+            service_name=service_name,
+            service_description=service_description,
+            schema=schema,
+            turns=turns,
+            original_metadata=metadata,
+        )
+    def _validate_schema(self, schema: Dict[str, Any]) -> bool:
+        """
+        Validate a schema
+        """
+        required_keys = {"service_name", "description", "slots", "intents"}
+        missing_keys = required_keys - schema.keys()
+        if missing_keys:
+            print(f"Warning: Missing keys in schema {schema.get('service_name', 'unknown')}: {missing_keys}")
+            return False
+        return True
+    def _load_schemas(self, schema_path: str) -> Dict[str, Any]:
+        """
+        Load and process service schemas
+        """
+        with open(schema_path, 'r', encoding='utf-8') as f:
+            schemas = json.load(f)
+        # Validate and index schemas
+        return {
+            schema["service_name"]: schema for schema in schemas if self._validate_schema(schema)
+        }
+    def _process_turns(self, turns: List[Dict]) -> List[Dict]:
+        """
+        Process dialogue turns into standardized format
+        """
+        processed_turns = []
+        for turn in turns:
+            try:
+                # Map speakers to standard format
+                speaker = 'assistant' if turn.get('speaker') == 'SYSTEM' else 'user'
+                # Extract utterance and clean it
+                text = turn.get('utterance', '').strip()
+                # Extract frames and dialogue acts
+                frames = turn.get('frames', [])
+                acts = []
+                slots = []
+                for frame in frames:
+                    if 'actions' in frame:
+                        acts.extend(frame['actions'])
+                    if 'slots' in frame:
+                        slots.extend(frame['slots'])
+                # Create the processed turn
+                processed_turn = {
+                    'speaker': speaker,
+                    'text': text,
+                    'original_speaker': turn.get('speaker', ''),
+                    'dialogue_acts': acts,
+                    'slots': slots,
+                    'metadata': {k: v for k, v in turn.items()
+                            if k not in {'speaker', 'utterance', 'frames'}}
+                }
+                processed_turns.append(processed_turn)
+            except Exception as e:
+                print(f"Error processing turn: {str(e)}")
+                continue
+        return processed_turns
+    def convert_to_pipeline_format(self, schema_dialogues: List[SchemaGuidedDialogue]) -> List[Dict]:
+        """
+        Convert SchemaGuidedDialogues to the format expected by the ProcessingPipeline
+        """
+        pipeline_dialogues = []
+        for dialogue in schema_dialogues:
+            # Convert turns to the expected format
+            processed_turns = [
+                {"speaker": turn["speaker"], "text": turn["text"]}
+                for turn in dialogue.turns if turn["text"].strip()
+            ]
+            # Create dialogue in pipeline format
+            pipeline_dialogue = {
+                'dialogue_id': dialogue.dialogue_id,
+                'turns': processed_turns,
+                'metadata': {
+                    'service_name': dialogue.service_name,
+                    'service_description': dialogue.service_description,
+                    'schema': dialogue.schema,
+                    **dialogue.original_metadata
+                }
+            }
+            pipeline_dialogues.append(pipeline_dialogue)
+        return pipeline_dialogues

setup.py ADDED Viewed

	@@ -0,0 +1,100 @@

+from setuptools import setup, find_packages
+import subprocess
+import sys
+with open("README.md", "r", encoding="utf-8") as fh:
+    long_description = fh.read()
+with open("requirements.txt", "r", encoding="utf-8") as fh:
+    requirements = [line.strip() for line in fh if line.strip() and not line.startswith("#")]
+def setup_spacy_model():
+    """
+    Download spaCy model.
+    """
+    subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
+def setup_models():
+    """
+    Download other required models.
+    """
+    import tensorflow_hub as hub
+    from sklearn.feature_extraction.text import TfidfVectorizer
+    from transformers import (
+        AutoTokenizer,
+        GPT2TokenizerFast,
+        MarianTokenizer
+    )
+    # Download Universal Sentence Encoder
+    _ = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
+    # Download paraphraser model
+    _ = AutoTokenizer.from_pretrained('humarin/chatgpt_paraphraser_on_T5_base')
+    # Download translation models
+    source_lang, pivot_lang, target_lang = 'en', 'de', 'es'
+    model_names = [
+        f'Helsinki-NLP/opus-mt-{source_lang}-{pivot_lang}',
+        f'Helsinki-NLP/opus-mt-{pivot_lang}-{target_lang}',
+        f'Helsinki-NLP/opus-mt-{target_lang}-{source_lang}'
+    ]
+    for model_name in model_names:
+        _ = MarianTokenizer.from_pretrained(model_name)
+    # Download GPT-2
+    _ = GPT2TokenizerFast.from_pretrained('gpt2')
+def setup_nltk():
+    """
+    Download required NLTK data.
+    """
+    import nltk
+    required_packages = [
+        'wordnet',
+        'averaged_perceptron_tagger_eng'
+    ]
+    for package in required_packages:
+        try:
+            print(f"Downloading {package}...")
+            nltk.download(package)
+            print(f"Successfully downloaded {package}")
+        except Exception as e:
+            print(f"Warning: Could not download {package}: {str(e)}")
+setup(
+    name="text-data-augmenter",
+    version="0.1.0",
+    author="Joe Armani",
+    author_email="[email protected]",
+    description="A tool for generating high-quality dialogue variations",
+    packages=find_packages(),
+    classifiers=[
+        "Development Status :: 3 - Alpha",
+        "Intended Audience :: Science/Research",
+        "License :: OSI Approved :: MIT License",
+        "Operating System :: OS Independent",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
+        "Topic :: Scientific/Engineering :: Artificial Intelligence",
+        "Topic :: Text Processing :: Linguistic",
+    ],
+    python_requires=">=3.8",
+    install_requires=requirements,
+    entry_points={
+        "console_scripts": [
+            "dialogue-augment=dialogue_augmenter.main:main",
+        ],
+    },
+    include_package_data=True,
+    package_data={
+        "dialogue_augmenter": ["data/*.json", "config/*.yaml"],
+    },
+)
+if __name__ == '__main__':
+    setup_spacy_model()
+    setup_models()
+    setup_nltk()

taskmaster_processor.py ADDED Viewed

	@@ -0,0 +1,192 @@

+from dataclasses import dataclass, field
+from typing import List, Dict, Optional, Any
+import json
+import re
+from pathlib import Path
+from pipeline_config import PipelineConfig
+@dataclass
+class TaskmasterDialogue:
+    """
+    Structured representation of a Taskmaster dialogue
+    """
+    conversation_id: str
+    instruction_id: Optional[str]
+    scenario: Optional[str]
+    domain: Optional[str]
+    turns: List[Dict[str, Any]]
+    original_metadata: Dict[str, Any] = field(default_factory=dict)
+    def __str__(self):
+        return f"TaskmasterDialogue(conversation_id={self.conversation_id}, turns={len(self.turns)} turns)"
+    def validate(self) -> bool:
+        return bool(self.conversation_id and isinstance(self.turns, list))
+class TaskmasterProcessor:
+    """
+    Handles processing and preparation of Taskmaster dataset dialogues
+    """
+    config: PipelineConfig
+    use_ontology: bool = False  # Whether to load and use ontology
+    ontology: Optional[Dict[str, Any]] = None  # Holds ontology data if loaded
+    domains: set = field(default_factory=set)  # Tracks unique domains
+    scenarios: set = field(default_factory=set)  # Tracks unique scenarios
+    def __init__(self, config: PipelineConfig, use_ontology: bool = False):
+        self.config = config
+        self.use_ontology = use_ontology
+        self.ontology = None
+        self.domains = set()
+        self.scenarios = set()
+    def load_dataset(self, base_dir: str, max_examples: Optional[int] = None) -> List[TaskmasterDialogue]:
+        """
+        Load and parse Taskmaster JSON dataset.
+        Handles self-dialogs, woz-dialogs, and ontology files.
+        """
+        required_files = {
+            "self-dialogs": "self-dialogs.json",
+            "woz-dialogs": "woz-dialogs.json",
+            "ontology": "ontology.json",
+        }
+        # Check for required files
+        missing_files = [name for name, path in required_files.items() if not Path(base_dir, path).exists()]
+        if missing_files:
+            raise FileNotFoundError(f"Missing required taskmaster files: {missing_files}")
+        # load ontology
+        ontology_path = Path(base_dir, required_files['ontology'])
+        with open(ontology_path, 'r', encoding='utf-8') as f:
+            self.ontology = json.load(f)
+        processed_dialogues = []
+        for file_key in ["self-dialogs", "woz-dialogs"]:
+            file_path = Path(base_dir, required_files[file_key])
+            with open(file_path, 'r', encoding='utf-8') as f:
+                raw_data = json.load(f)
+            for dialogue in raw_data:
+                # Extract core dialogue components
+                conversation_id = dialogue.get('conversation_id', '')
+                instruction_id = dialogue.get('instruction_id', None)
+                if 'utterances' in dialogue:
+                    turns = self._process_utterances(dialogue['utterances'])
+                    scenario = dialogue.get('scenario', '')
+                    domain = self._extract_domain(scenario)
+                else:
+                    turns = []
+                    scenario = ''
+                    domain = ''
+                # Store metadata
+                metadata = {k: v for k, v in dialogue.items()
+                            if k not in {'conversation_id', 'instruction_id', 'utterances'}}
+                # Create structured dialogue object
+                processed_dialogue = TaskmasterDialogue(
+                    conversation_id=conversation_id,
+                    instruction_id=instruction_id,
+                    scenario=scenario,
+                    domain=domain,
+                    turns=turns,
+                    original_metadata=metadata
+                )
+                processed_dialogues.append(processed_dialogue)
+                # Update domain and scenario tracking
+                if domain:
+                    self.domains.add(domain)
+                if scenario:
+                    self.scenarios.add(scenario)
+                if max_examples and len(processed_dialogues) >= max_examples:
+                    break
+        return processed_dialogues
+    def _process_utterances(self, utterances: List[Dict]) -> List[Dict]:
+        """
+        Process utterances into a standardized format
+        """
+        processed_turns = []
+        for utterance in utterances:
+            # Map Taskmaster speaker roles to your expected format
+            speaker = 'assistant' if utterance.get('speaker') == 'ASSISTANT' else 'user'
+            # Extract and clean the text
+            text = utterance.get('text', '').strip()
+            # Extract any segments or annotations if present
+            segments = utterance.get('segments', [])
+            # Create the processed turn
+            turn = {
+                'speaker': speaker,
+                'text': text,
+                'original_speaker': utterance.get('speaker', ''),
+                'segments': segments,
+                'metadata': {k: v for k, v in utterance.items()
+                           if k not in {'speaker', 'text', 'segments'}}
+            }
+            processed_turns.append(turn)
+        return processed_turns
+    def _extract_domain(self, scenario: str) -> str:
+        """
+        Extract domain from scenario description
+        """
+        domain_patterns = {
+            'restaurant': r'\b(restaurant|dining|food|reservation)\b',
+            'movie': r'\b(movie|cinema|film|ticket)\b',
+            'ride_share': r'\b(ride|taxi|uber|lyft)\b',
+            'coffee': r'\b(coffee|café|cafe|starbucks)\b',
+            'pizza': r'\b(pizza|delivery|order food)\b',
+            'auto': r'\b(car|vehicle|repair|maintenance)\b',
+        }
+        scenario_lower = scenario.lower()
+        for domain, pattern in domain_patterns.items():
+            if re.search(pattern, scenario_lower):
+                return domain
+        return 'other'
+    def convert_to_pipeline_format(self, taskmaster_dialogues: List[TaskmasterDialogue]) -> List[Dict]:
+        """
+        Convert TaskmasterDialogues to the format expected by the ProcessingPipeline
+        """
+        pipeline_dialogues = []
+        for dialogue in taskmaster_dialogues:
+            # Convert turns to the expected format
+            processed_turns = []
+            for turn in dialogue.turns:
+                if turn['text'].strip():  # Skip empty turns
+                    processed_turns.append({
+                        'speaker': turn['speaker'],
+                        'text': turn['text']
+                    })
+            # Create dialogue in pipeline format
+            pipeline_dialogue = {
+                'dialogue_id': dialogue.conversation_id,
+                'turns': processed_turns,
+                'metadata': {
+                    'instruction_id': dialogue.instruction_id,
+                    'scenario': dialogue.scenario,
+                    'domain': dialogue.domain,
+                    **dialogue.original_metadata
+                }
+            }
+            pipeline_dialogues.append(pipeline_dialogue)
+        return pipeline_dialogues