JoeArmani committed
Commit bc503de · Parent: febdb1e

update gpu processing

Files changed (4)
  1. .gitignore +3 -0
  2. dialogue_augmenter.py +69 -232
  3. main.py +7 -19
  4. processing_pipeline.py +143 -40
.gitignore CHANGED
@@ -156,3 +156,6 @@ cython_debug/
 
 datasets/*
 !datasets/.gitkeep
+
+processed_outputs/*
+!processed_outputs/.gitkeep
dialogue_augmenter.py CHANGED
@@ -1,5 +1,6 @@
 from typing import Dict, List
 import numpy as np
+import torch
 import tensorflow as tf
 import tensorflow_hub as hub
 import re
@@ -19,13 +20,53 @@ class DialogueAugmenter:
     def __init__(self, nlp, config: PipelineConfig):
         self.nlp = nlp
         self.config = config
+
+        # Detect hardware and set appropriate batch sizes and optimization strategy
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        self.use_gpu = torch.cuda.is_available()
+
+        if self.config.debug:
+            print(f"Using device: {self.device}")
+            if self.use_gpu:
+                print(f"GPU Device: {torch.cuda.get_device_name(0)}")
+
+        # Load base models
         self.quality_metrics = QualityMetrics(config)
         self.use_model = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
 
+        # Initialize augmentation models based on hardware
+        self._initialize_augmentation_models()
+
+        # Initialize caches
+        self.embedding_cache = {}
+        self.perplexity_cache = {}
+
+        # Compile regex patterns
+        self.spelling_pattern = re.compile(r'[a-zA-Z]{3,}')
+
+        # GPU memory management if available
+        if self.use_gpu:
+            gpus = tf.config.list_physical_devices('GPU')
+            if gpus:
+                try:
+                    for gpu in gpus:
+                        tf.config.experimental.set_memory_growth(gpu, True)
+                except RuntimeError as e:
+                    print(e)
+
+    def _initialize_augmentation_models(self):
+        """Initialize augmentation models with appropriate device settings"""
         # Advanced augmentation techniques
         self.paraphraser = Paraphraser()
         self.back_translator = BackTranslator()
 
+        if self.use_gpu:
+            # Move models to GPU if available
+            self.paraphraser.model = self.paraphraser.model.to(self.device)
+            self.back_translator.model_pivot_forward = self.back_translator.model_pivot_forward.to(self.device)
+            self.back_translator.model_pivot_backward = self.back_translator.model_pivot_backward.to(self.device)
+            self.back_translator.model_backward = self.back_translator.model_backward.to(self.device)
+
         # Basic augmentation techniques
         self.word_augmenter = naw.SynonymAug(aug_src='wordnet')
         self.spelling_augmenter = naw.SpellingAug()
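
Note on the hunk above: the commit runs two frameworks on one device. The PyTorch models (paraphraser, back-translator) are moved to CUDA with `.to(self.device)`, while the Universal Sentence Encoder runs under TensorFlow, which by default pre-allocates nearly all GPU memory at startup and would starve the PyTorch side. Below is a minimal sketch of that coexistence pattern in isolation (editor's example, not code from the repository; assumes both `torch` and `tensorflow` are installed):

```python
import torch
import tensorflow as tf

# Prefer CUDA for the PyTorch side when a GPU is present.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# With memory growth enabled, TensorFlow allocates GPU memory on demand
# instead of claiming the whole card, leaving room for PyTorch models.
for gpu in tf.config.list_physical_devices('GPU'):
    try:
        tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # Memory growth must be set before TensorFlow initializes the GPU.
        print(e)
```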
@@ -37,63 +78,62 @@ class DialogueAugmenter:
                 ('spelling', self.spelling_augmenter)
             ]
         }
-
-        # Initialize cache
-        self.embedding_cache = {}
-        self.perplexity_cache = {}
-
-        # Compile regex patterns
-        self.spelling_pattern = re.compile(r'[a-zA-Z]{3,}')
-
-        # GPU memory management
-        gpus = tf.config.list_physical_devices('GPU')
-        if gpus:
-            try:
-                for gpu in gpus:
-                    tf.config.experimental.set_memory_growth(gpu, True)
-            except RuntimeError as e:
-                print(e)
 
     @lru_cache(maxsize=1024)
     def _compute_embedding(self, text: str) -> np.ndarray:
         """Cached computation of text embedding"""
-        return self.use_model([text])[0].numpy()
+        if text in self.embedding_cache:
+            return self.embedding_cache[text]
+        embedding = self.use_model([text])[0].numpy()
+        self.embedding_cache[text] = embedding
+        return embedding
 
     def _compute_batch_embeddings(self, texts: List[str]) -> np.ndarray:
-        """Compute embeddings for multiple texts at once"""
-        return self.use_model(texts).numpy()
+        """Compute embeddings for multiple texts at once with hardware optimization"""
+        # Check cache first
+        uncached_texts = [t for t in texts if t not in self.embedding_cache]
+        if uncached_texts:
+            embeddings = self.use_model(uncached_texts).numpy()
+            # Update cache
+            for text, embedding in zip(uncached_texts, embeddings):
+                self.embedding_cache[text] = embedding
+
+        # Return all embeddings (from cache or newly computed)
+        return np.array([self.embedding_cache[t] for t in texts])
 
     def _quick_quality_check(self, variation: str, original: str) -> bool:
         """
-        Simplified preliminary quality check with minimal standards
+        Stricter preliminary quality check while maintaining reasonable pass rates
         """
         if self.config.debug:
             print(f"\nQuick check for variation: {variation}")
-
-        # Only reject if length is extremely different
+
+        # Stricter length check
         orig_len = len(original.split())
         var_len = len(variation.split())
 
-        # For very short texts (1-3 words), allow more variation
+        # For very short texts (1-3 words), still allow more variation
         if orig_len <= 3:
-            if var_len > orig_len * 4:  # Allow up to 4x length for short texts
+            if var_len > orig_len * 3:  # Reduced from 4x to 3x
                 if self.config.debug:
                     print(f"Failed length check (short text): {var_len} vs {orig_len}")
                 return False
         else:
-            if var_len > orig_len * 3:  # Allow up to 3x length for longer texts
+            if var_len > orig_len * 2:  # Reduced from 3x to 2x
                 if self.config.debug:
                     print(f"Failed length check (long text): {var_len} vs {orig_len}")
                 return False
 
-        # Basic content check - at least one word in common (excluding stop words)
-        stop_words = {'a', 'an', 'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'is', 'are'}
+        # Enhanced content check - more words in common
+        stop_words = {'a', 'an', 'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'is', 'are', 'that', 'this', 'will', 'can'}
         orig_words = set(w.lower() for w in original.split() if w.lower() not in stop_words)
         var_words = set(w.lower() for w in variation.split() if w.lower() not in stop_words)
 
-        if not orig_words.intersection(var_words):
+        # Require more content word overlap
+        content_overlap = len(orig_words.intersection(var_words)) / len(orig_words) if orig_words else 0
+        if content_overlap < 0.3:  # Increased from no minimum to 30% overlap
             if self.config.debug:
-                print("Failed content check: no content words in common")
+                print(f"Failed content check: overlap {content_overlap:.2f}")
             return False
 
         if self.config.debug:
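
Note on `_quick_quality_check`: the tightened gate combines a length-ratio cap (up to 3x for originals of three words or fewer, 2x otherwise) with a minimum 30% overlap on non-stop-word content. A self-contained sketch of the same logic, with a couple of illustrative assertions (the helper name and the example sentences are the editor's, not the repository's):

```python
# Stop words mirror the set used in the hunk above.
STOP_WORDS = {'a', 'an', 'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to',
              'for', 'is', 'are', 'that', 'this', 'will', 'can'}

def quick_quality_check(variation: str, original: str) -> bool:
    orig_words = original.split()
    var_words = variation.split()

    # Length gate: short originals (<= 3 words) may grow up to 3x, longer ones 2x.
    max_ratio = 3 if len(orig_words) <= 3 else 2
    if len(var_words) > len(orig_words) * max_ratio:
        return False

    # Content gate: at least 30% of the original's content words must survive.
    orig_content = {w.lower() for w in orig_words if w.lower() not in STOP_WORDS}
    var_content = {w.lower() for w in var_words if w.lower() not in STOP_WORDS}
    overlap = len(orig_content & var_content) / len(orig_content) if orig_content else 0
    return overlap >= 0.3

# "table" and "two" survive the paraphrase: 2/3 content overlap passes.
assert quick_quality_check("reserve a table for two people", "book a table for two")
# Zero content overlap fails the 30% threshold.
assert not quick_quality_check("completely unrelated sentence here", "book a table")
```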
@@ -401,69 +441,6 @@
         text1 = " ".join(turn['text'] for turn in dialogue1['turns'])
         text2 = " ".join(turn['text'] for turn in dialogue2['turns'])
         return text1 == text2
-
-    # def _augment_turn(self, turn: Dict, context: List[str]) -> List[Dict]:
-    #     """
-    #     Generate augmented versions of the turn using multiple strategies.
-    #     """
-    #     text = turn['text']
-    #     words = text.split()
-
-    #     # Special handling for very short texts
-    #     if len(words) < 3:
-    #         return self._augment_short_text(turn)
-
-    #     all_variations = set()
-
-    #     # Advanced augmentations (paraphrase and back-translation)
-    #     for augmenter in self.augmenters['advanced']:
-    #         try:
-    #             if isinstance(augmenter, Paraphraser):
-    #                 variations = augmenter.paraphrase(text)
-    #                 all_variations.update(variations)
-    #             elif isinstance(augmenter, BackTranslator):
-    #                 aug_text = augmenter.back_translate(text)
-    #                 if aug_text:
-    #                     all_variations.add(aug_text)
-    #         except Exception as e:
-    #             print(f"Error in advanced augmentation: {str(e)}")
-    #             continue
-
-    #     # Basic nlpaug augmentations
-    #     for aug_type, augmenter in self.augmenters['basic']:
-    #         try:
-    #             if aug_type == 'spelling' and self._is_technical_or_formal_text(text):
-    #                 continue
-
-    #             aug_texts = augmenter.augment(text, n=2)
-    #             if isinstance(aug_texts, list):
-    #                 all_variations.update(aug_texts)
-    #             else:
-    #                 all_variations.add(aug_texts)
-    #         except Exception as e:
-    #             print(f"Error in {aug_type} augmentation: {str(e)}")
-    #             continue
-
-    #     # Remove exact duplicates and empty strings
-    #     augmented_texts = [t for t in list(all_variations) if t.strip()]
-
-    #     # Apply context filtering
-    #     if context:
-    #         augmented_texts = self._filter_by_context(augmented_texts, context)
-    #         print(f"After context filtering: {len(augmented_texts)} variations")
-
-    #     # Select best variations
-    #     best_variations = self._select_best_augmentations(
-    #         text,
-    #         augmented_texts,
-    #         num_to_select=self.config.augmentation_factor,
-    #         min_quality_score=0.7
-    #     )
-
-    #     # Create variations with speaker info
-    #     variations = [{'speaker': turn['speaker'], 'text': text} for text in best_variations]
-
-    #     return variations
 
     def _augment_short_text(self, turn: Dict) -> List[Dict]:
         """
@@ -574,143 +551,3 @@
             return True
 
         return False
-
-    # def _filter_by_context(self, variations: List[str], context: List[str]) -> List[str]:
-    #     """
-    #     Filter variations based on conversation context using config parameters.
-    #     """
-    #     # Manage context window using config
-    #     recent_context = context[-self.config.context_window_size:] if len(context) > self.config.context_window_size else context
-
-    #     filtered_variations = []
-    #     context_embedding = self.use_model([' '.join(recent_context)])[0].numpy()
-
-    #     prev_turn = recent_context[-1] if recent_context else ''
-
-    #     for variation in variations:
-    #         var_embedding = self.use_model([variation])[0].numpy()
-
-    #         # Overall context similarity
-    #         context_similarity = cosine_similarity([context_embedding], [var_embedding])[0][0]
-
-    #         # Direct response coherence
-    #         response_coherence = 1.0
-    #         if prev_turn:
-    #             prev_embedding = self.use_model([prev_turn])[0].numpy()
-    #             response_coherence = cosine_similarity([prev_embedding], [var_embedding])[0][0]
-
-    #         # Use weights from config
-    #         combined_similarity = (
-    #             self.config.context_similarity_weight * context_similarity +
-    #             self.config.response_coherence_weight * response_coherence
-    #         )
-
-    #         if (combined_similarity >= self.config.semantic_similarity_threshold and
-    #             response_coherence >= self.config.min_response_coherence):
-    #             filtered_variations.append(variation)
-    #             if self.config.debug:
-    #                 print(f"Accepted variation: {variation}")
-    #                 print(f"Context similarity: {context_similarity:.3f}")
-    #                 print(f"Response coherence: {response_coherence:.3f}")
-    #                 print(f"Combined score: {combined_similarity:.3f}\n")
-    #         else:
-    #             if self.config.debug:
-    #                 print(f"Rejected variation: {variation}")
-    #                 print(f"Combined score {combined_similarity:.3f} below threshold "
-    #                       f"{self.config.semantic_similarity_threshold}")
-    #                 print(f"Response coherence {response_coherence:.3f} below threshold "
-    #                       f"{self.config.min_response_coherence}\n")
-
-    #     return filtered_variations or variations  # Fallback to original
-
-    # def _select_best_augmentations(self, original: str, candidates: List[str], used_variations: set = None,
-    #                                num_to_select: int = 3, min_quality_score: float = 0.7) -> List[str]:
-    #     """
-    #     Select the best augmentations using a quality score.
-    #     Args:
-    #         original (str): The original text
-    #         candidates (List[str]): List of candidate augmented texts
-    #         used_variations (set): Set of already used variations
-    #         num_to_select (int): Number of variations to select
-    #         min_quality_score (float): Minimum quality score threshold
-    #     """
-    #     if used_variations is None:
-    #         used_variations = set()
-
-    #     candidates = [c for c in candidates if c.strip()]
-
-    #     # Skip short text
-    #     if len(original.split()) < 3:
-    #         print(f"Text too short for augmentation: {original}")
-    #         return [original]
-
-    #     scored_candidates = []
-    #     for candidate in candidates:
-    #         if candidate in used_variations:
-    #             continue
-
-    #         metrics = self.quality_metrics.compute_metrics(original, candidate)
-
-    #         # Add contextual penalty for inappropriate audience terms
-    #         audience_terms = {'everyone', 'everybody', 'folks', 'all', 'guys', 'people'}
-    #         has_audience_term = any(term in candidate.lower() for term in audience_terms)
-    #         audience_penalty = 0.2 if has_audience_term else 0.0
-
-    #         # Weighted quality score
-    #         quality_score = (
-    #             0.40 * metrics['semantic_similarity'] +          # Semantic preservation
-    #             0.25 * (1.0 - metrics['perplexity'] / 100) +     # Fluency
-    #             0.15 * (1.0 - metrics['grammar_errors'] / 10) +  # Grammar
-    #             0.15 * metrics['content_preservation'] +         # Content preservation
-    #             0.05 * metrics['type_token_ratio']               # Lexical diversity
-    #         )
-
-    #         quality_score -= audience_penalty
-
-    #         if (metrics['semantic_similarity'] < 0.5 or  # Reject on semantic threshold miss
-    #             metrics['rouge1_f1'] < 0.2):             # Enforce minimum lexical overlap
-    #             continue
-
-    #         # Bonus points for:
-    #         # Length similarity to original
-    #         if 0.75 <= metrics['length_ratio'] <= 1.25:
-    #             quality_score += 0.05
-
-    #         # Correct grammar
-    #         if metrics['grammar_errors'] == 0:
-    #             quality_score += 0.025
-
-    #         print(f"Candidate: {candidate}")
-    #         print(f"Quality score: {quality_score:.2f}, Metrics: {metrics}")
-
-    #         # Consider the augmentation if it meets the basic quality threshold
-    #         if quality_score >= min_quality_score:
-    #             print('Candidate accepted\n')
-    #             scored_candidates.append((candidate, quality_score, metrics))
-    #         else:
-    #             print('Candidate rejected\n')
-
-    #     # Sort by quality score with small random factor for diversity
-    #     scored_candidates.sort(key=lambda x: x[1], reverse=True)
-
-    #     selected = []
-    #     for candidate, score, metrics in scored_candidates:
-    #         # Check diversity against already selected
-    #         if len(selected) == 0:
-    #             selected.append(candidate)
-    #             continue
-
-    #         # Compute average similarity to already selected
-    #         avg_similarity = np.mean([
-    #             self.quality_metrics.compute_semantic_similarity(candidate, prev)
-    #             for prev in selected
-    #         ])
-
-    #         # Add if sufficiently different (similarity < 0.98)
-    #         if avg_similarity < 0.98:
-    #             selected.append(candidate)
-
-    #         if len(selected) >= num_to_select:
-    #             break
-
-    #     return selected
main.py CHANGED
@@ -65,28 +65,26 @@ def main():
         context_window_size=4,
         max_complexity_threshold=100,
         use_cache=False,
-        debug=True,
+        debug=False,
         allowed_speakers=['user', 'assistant'],
         required_fields=['dialogue_id', 'turns']
     )
 
     try:
         # Set max_examples (Optional[int]) for testing
-        max_examples = None
+        max_examples = 5
 
         # Initialize and load Taskmaster dataset
         print("Loading Taskmaster dataset")
         taskmaster_processor = TaskmasterProcessor(config, use_ontology=False)
-        taskmaster_dir = './datasets/taskmaster'
-        taskmaster_dialogues = taskmaster_processor.load_dataset(taskmaster_dir, max_examples=max_examples)
+        taskmaster_dialogues = taskmaster_processor.load_dataset('./datasets/taskmaster', max_examples=max_examples)
         taskmaster_pipeline_dialogues = taskmaster_processor.convert_to_pipeline_format(taskmaster_dialogues)
         print(f"Processed Taskmaster dialogues: {len(taskmaster_pipeline_dialogues)}")
 
         # Initialize and load Schema-Guided dataset
         print("Loading Schema-Guided dataset")
        schema_dialogue_processor = SchemaGuidedProcessor(config)
-        schema_guided_dir = './datasets/schema_guided'
-        schema_dialogues = schema_dialogue_processor.load_dataset(schema_guided_dir, max_examples=max_examples)
+        schema_dialogues = schema_dialogue_processor.load_dataset('./datasets/schema_guided', max_examples=max_examples)
         schema_pipeline_dialogues = schema_dialogue_processor.convert_to_pipeline_format(schema_dialogues)
         print(f"Processed Schema-Guided dialogues: {len(schema_pipeline_dialogues)}")
 
@@ -102,19 +100,9 @@ def main():
         # Process through augmentation pipeline
         print("Processing combined dataset")
         pipeline = ProcessingPipeline(config)
-        processed_dialogues = pipeline.process_dataset(combined_dialogues)
-
-        # Save results
-        output_path = 'augmented_combined_dataset.json'
-        with open(output_path, 'w', encoding='utf-8') as f:
-            json.dump(processed_dialogues, f, indent=2, ensure_ascii=False)
-
-        # Print statistics
-        print(f"\nProcessed Statistics:")
-        print(f"Total dialogues: {len(processed_dialogues)}")
-        print(f"Taskmaster domains: {len(taskmaster_processor.domains)}")
-        print(f"Schema-Guided services: {len(schema_dialogue_processor.services)}")
-        print(f"Schema-Guided domains: {len(schema_dialogue_processor.domains)}")
+        output_path = pipeline.process_dataset(combined_dialogues)
+        print(f"Processing complete. Results saved to {output_path}")
+        pipeline.cleanup()
 
     except Exception as e:
         print(f"Processing failed: {str(e)}")
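Note on the main.py changes: `process_dataset` now returns the path of the combined JSON file rather than the dialogues themselves, so the old inline statistics block no longer has the data in memory and was dropped. If those counts are still wanted, they can be recomputed from disk; a minimal sketch, assuming the output is a flat JSON list of dialogue dicts as written by `combine_results` (the `print_stats` helper is hypothetical, not part of the commit):

```python
import json
from pathlib import Path

def print_stats(output_path: Path) -> None:
    # Load the combined dataset that the pipeline wrote to disk.
    with open(output_path, 'r', encoding='utf-8') as f:
        dialogues = json.load(f)
    print(f"Total dialogues: {len(dialogues)}")

# Usage after the pipeline run:
#   output_path = pipeline.process_dataset(combined_dialogues)
#   print_stats(output_path)
```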
processing_pipeline.py CHANGED
@@ -4,13 +4,15 @@ from typing import List, Dict, Optional
 import json
 import re
 import hashlib
-import pickle
 import spacy
+import torch
 from tqdm import tqdm
 from pipeline_config import PipelineConfig
 from dialogue_augmenter import DialogueAugmenter
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
+from concurrent.futures import ProcessPoolExecutor
+from typing import Set
 
 class ProcessingPipeline:
     """
@@ -24,50 +26,151 @@ class ProcessingPipeline:
         self.num_threads = self.config.batch_size
         self.cache_dir = Path("./cache")
         self.cache_dir.mkdir(exist_ok=True)
-
-    def process_dataset(self, dialogues: List[Dict]) -> List[Dict]:
-        """
-        Process entire dataset through the pipeline.
-        """
-        print(f"Processing {len(dialogues)} dialogues")
-        start_time = datetime.now()
-
-        # Check cache
-        if self.config.use_cache:
-            cache_path = self._get_cache_path(dialogues)
-            if cache_path.exists():
-                print("Loading from cache...")
-                with open(cache_path, 'rb') as f:
-                    return pickle.load(f)
-
-        # Validate and clean
-        valid_dialogues = self._process_validation(
-            dialogues,
-            self._validate_and_clean_dialogue,
-            "validating and cleaning"
-        )
+        self.output_dir = Path("processed_outputs")
+        self.output_dir.mkdir(exist_ok=True)
+        self.checkpoint_file = self.output_dir / "processing_checkpoint.json"
+        self.batch_size = self.config.batch_size
+        self.use_gpu = torch.cuda.is_available()
+        self.batch_size = 32 if self.use_gpu else 8
+        self.use_multiprocessing = not self.use_gpu
 
-        if not valid_dialogues:
-            raise ValueError("Dialogue validation resulted in an empty dataset.")
-
-        deduplicated_dialogues = self._deduplicate_dialogues(valid_dialogues)
+        if self.config.debug:
+            print(f"ProcessingPipeline initialized with:")
+            print(f"- GPU available: {self.use_gpu}")
+            print(f"- Batch size: {self.batch_size}")
+            print(f"- Using multiprocessing: {self.use_multiprocessing}")
+
+    def _save_batch(self, batch_results: List[Dict], batch_num: int) -> Path:
+        """Save a batch of results to a separate JSON file"""
+        batch_file = self.output_dir / f"batch_{batch_num:04d}.json"
+        with open(batch_file, 'w') as f:
+            json.dump(batch_results, f)
+        return batch_file
+
+    def _load_checkpoint(self) -> set:
+        """Load set of processed dialogue IDs from checkpoint"""
+        if self.checkpoint_file.exists():
+            with open(self.checkpoint_file, 'r') as f:
+                return set(json.load(f))
+        return set()
 
-        # Augment dialogues
-        all_processed_dialogues = []
-        for dialogue in deduplicated_dialogues:
-            augmented = self.augmenter.augment_dialogue(dialogue)
-            all_processed_dialogues.extend(augmented)
+    def _update_checkpoint(self, processed_ids: set):
+        """Update checkpoint with newly processed IDs"""
+        with open(self.checkpoint_file, 'w') as f:
+            json.dump(list(processed_ids), f)
 
-        # Save to cache
-        if self.config.use_cache:
-            with open(cache_path, 'wb') as f:
-                pickle.dump(all_processed_dialogues, f)
+    def _process_batch(self, batch: List[Dict]) -> List[Dict]:
+        """Process batch with optimized model calls"""
+        results = []
+        try:
+            if self.use_gpu:
+                results = self.augmenter.process_batch(batch)
+            else:
+                # Collect all texts that need processing
+                all_texts = []
+                text_to_dialogue_map = {}
+                for dialogue in batch:
+                    for turn in dialogue['turns']:
+                        all_texts.append(turn['text'])
+                        text_to_dialogue_map[turn['text']] = dialogue['dialogue_id']
+
+                # Batch process embeddings
+                embeddings = self.augmenter._compute_batch_embeddings(all_texts)
+
+                # Process dialogues with cached embeddings
+                for dialogue in batch:
+                    try:
+                        augmented = self.augmenter.augment_dialogue(dialogue)
+                        results.extend(augmented)
+                    except Exception as e:
+                        print(f"Error processing dialogue {dialogue.get('dialogue_id', 'unknown')}: {str(e)}")
+                        continue
+        except Exception as e:
+            print(f"Error processing batch: {str(e)}")
+        return results
+
+    def combine_results(self) -> Path:
+        """Combine all batch files into final output"""
+        all_results = []
+        batch_files = sorted(self.output_dir.glob("batch_*.json"))
+
+        print(f"Combining {len(batch_files)} batch files...")
+        for batch_file in tqdm(batch_files):
+            with open(batch_file, 'r') as f:
+                batch_data = json.load(f)
+                all_results.extend(batch_data)
+
+        # Save combined results
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        final_output = self.output_dir / f"augmented_dataset_{timestamp}.json"
+        with open(final_output, 'w') as f:
+            json.dump(all_results, f)
+
+        if self.config.debug:
+            print(f"Combined {len(all_results)} dialogues into {final_output}")
+
+        return final_output
 
-        processing_time = datetime.now() - start_time
-        print(f"Processing completed in {processing_time}")
-        print(f"Generated {len(all_processed_dialogues)} total dialogues")
+    def process_dataset(self, dialogues: List[Dict]) -> Path:
+        """Process dataset with hardware-appropriate optimizations and progress tracking"""
+        processed_ids = self._load_checkpoint()
+
+        # Filter out already processed dialogues
+        remaining_dialogues = [d for d in dialogues
+                               if d['dialogue_id'] not in processed_ids]
+
+        total_dialogues = len(dialogues)
+        remaining_count = len(remaining_dialogues)
+        processed_count = total_dialogues - remaining_count
+
+        print("\nDataset Processing Status:")
+        print(f"Total dialogues in dataset: {total_dialogues}")
+        print(f"Previously processed: {processed_count}")
+        print(f"Remaining to process: {remaining_count}")
+        print("-" * 50)
+
+        # Process in batches with progress bar
+        for batch_num in tqdm(range(0, len(remaining_dialogues), self.batch_size),
+                              desc="Processing batches",
+                              total=(len(remaining_dialogues) + self.batch_size - 1) // self.batch_size):
+            batch = remaining_dialogues[batch_num:batch_num + self.batch_size]
+            current_position = processed_count + batch_num + len(batch)
+
+            total_progress = (current_position / total_dialogues) * 100
+            batch_progress = (batch_num + 1) / ((len(remaining_dialogues) + self.batch_size - 1) // self.batch_size) * 100
+
+            print(f"\rProgress: {current_position}/{total_dialogues} dialogues "
+                  f"({total_progress:.1f}% complete) - "
+                  f"Batch {batch_num//self.batch_size + 1} of "
+                  f"{(len(remaining_dialogues) + self.batch_size - 1) // self.batch_size}", end="")
+
+            # Process batch
+            batch_results = self._process_batch(batch)
+
+            if batch_results:
+                self._save_batch(batch_results, batch_num)
+                batch_ids = {d['dialogue_id'] for d in batch}
+                processed_ids.update(batch_ids)
+                self._update_checkpoint(processed_ids)
+
+        print("\n" + "-" * 50)
+        print("Processing complete. Combining results...")
+        return self.combine_results()
 
-        return all_processed_dialogues
+    def cleanup(self):
+        """Clean up intermediate batch files after successful processing"""
+        batch_files = list(self.output_dir.glob("batch_*.json"))
+        for file in batch_files:
+            try:
+                file.unlink()
+            except Exception as e:
+                print(f"Error deleting {file}: {e}")
+
+        if self.checkpoint_file.exists():
+            try:
+                self.checkpoint_file.unlink()
+            except Exception as e:
+                print(f"Error deleting checkpoint file: {e}")
 
     def _deduplicate_dialogues(self, dialogues: List[Dict], threshold: float = 0.9) -> List[Dict]:
         """