""" CSC525 - Module 8 Option 2 - Joseph Armani Description and References in the README.md file. """ import json import tensorflow as tf from typing import List, Dict from pipeline_config import PipelineConfig from processing_pipeline import ProcessingPipeline from taskmaster_processor import TaskmasterProcessor from schema_guided_dialogue_processor import SchemaGuidedProcessor def combine_datasets(taskmaster_dialogues: List[Dict], schema_guided_dialogues: List[Dict]) -> List[Dict]: """ Combine dialogues from both datasets into a single list Args: taskmaster_dialogues: List of dialogues in pipeline format from Taskmaster schema_guided_dialogues: List of dialogues in pipeline format from Schema-Guided Returns: List[Dict]: Combined list of dialogues """ # Ensure unique dialogue IDs combined_dialogues = [] seen_ids = set() duplicate_count = 0 # Track duplicates for reporting for dialogue in taskmaster_dialogues: dialogue_copy = dialogue.copy() dialogue_id = dialogue_copy['dialogue_id'] if dialogue_id in seen_ids: duplicate_count += 1 dialogue_id = f"taskmaster_{dialogue_id}" seen_ids.add(dialogue_id) dialogue_copy['dialogue_id'] = dialogue_id combined_dialogues.append(dialogue_copy) for dialogue in schema_guided_dialogues: dialogue_copy = dialogue.copy() dialogue_id = dialogue_copy['dialogue_id'] if dialogue_id in seen_ids: duplicate_count += 1 dialogue_id = f"schema_guided_{dialogue_id}" seen_ids.add(dialogue_id) dialogue_copy['dialogue_id'] = dialogue_id combined_dialogues.append(dialogue_copy) # Log the results print(f"Combine Datasets: Found and resolved {duplicate_count} duplicate dialogue IDs.") print(f"Combine Datasets: Total dialogues combined: {len(combined_dialogues)}") return combined_dialogues def main(): # Configuration config = PipelineConfig( min_length=1, max_length=512, batch_size=32 if tf.config.list_physical_devices('GPU') else 16, max_turns_per_dialogue=12, max_variations_per_turn=4, max_sampled_variations=2, context_window_size=4, max_complexity_threshold=100, use_cache=False, debug=True, allowed_speakers=['user', 'assistant'], required_fields=['dialogue_id', 'turns'] ) try: # Set max_examples (Optional[int]) for testing max_examples = 5 # Initialize and load Taskmaster dataset print("Loading Taskmaster dataset") taskmaster_processor = TaskmasterProcessor(config, use_ontology=False) taskmaster_dialogues = taskmaster_processor.load_dataset('./datasets/taskmaster', max_examples=max_examples) taskmaster_pipeline_dialogues = taskmaster_processor.convert_to_pipeline_format(taskmaster_dialogues) print(f"Processed Taskmaster dialogues: {len(taskmaster_pipeline_dialogues)}") # Initialize and load Schema-Guided dataset print("Loading Schema-Guided dataset") schema_dialogue_processor = SchemaGuidedProcessor(config) schema_dialogues = schema_dialogue_processor.load_dataset('./datasets/schema_guided', max_examples=max_examples) schema_pipeline_dialogues = schema_dialogue_processor.convert_to_pipeline_format(schema_dialogues) print(f"Processed Schema-Guided dialogues: {len(schema_pipeline_dialogues)}") # Combine datasets print("Combining datasets") combined_dialogues = combine_datasets(taskmaster_pipeline_dialogues, schema_pipeline_dialogues) print(f"Combined Dialogues: {len(combined_dialogues)}") if not combined_dialogues: print("Combined dialogues are empty. Exiting.") return # Process through augmentation pipeline print("Processing combined dataset") pipeline = ProcessingPipeline(config) output_path = pipeline.process_dataset(combined_dialogues) print(f"Processing complete. 
Results saved to {output_path}") pipeline.cleanup() except Exception as e: print(f"Processing failed: {str(e)}") raise if __name__ == "__main__": main()
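
# ---------------------------------------------------------------------------
# Minimal usage sketch for combine_datasets (illustration only, not part of
# the pipeline run above). The dialogue dicts below are hypothetical
# stand-ins for the pipeline-format records produced by the processors, and
# the module name in the import is an assumption about how this file is
# saved:
#
#   from main import combine_datasets
#
#   taskmaster = [{'dialogue_id': 'd1', 'turns': []}]
#   schema_guided = [{'dialogue_id': 'd1', 'turns': []}]
#   merged = combine_datasets(taskmaster, schema_guided)
#   print([d['dialogue_id'] for d in merged])
#   # -> ['d1', 'schema_guided_d1']  (the clash is resolved by prefixing,
#   #    so both dialogues survive the merge)
# ---------------------------------------------------------------------------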