import json
from datetime import datetime
from pathlib import Path

from data_augmentation.pipeline_config import PipelineConfig
from data_augmentation.taskmaster_processor import TaskmasterProcessor


def main():
    # 1) Set up config
    config = PipelineConfig(
        max_length=512,
        min_turns=4,
        min_user_words=3,
        debug=True
    )

    # 2) Instantiate processor
    base_dir = "datasets/taskmaster"
    processor = TaskmasterProcessor(config)

    # 3) Load raw dialogues
    dialogues = processor.load_taskmaster_dataset(base_dir=base_dir, max_examples=None)

    # 4) Filter & convert to final structure
    final_dialogues = processor.filter_and_convert(dialogues)

    # 5) Save final data
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_dir = Path("processed_outputs")
    output_dir.mkdir(parents=True, exist_ok=True)
    out_file = output_dir / f"taskmaster_only_{timestamp}.json"
    with open(out_file, 'w', encoding='utf-8') as f:
        json.dump(final_dialogues, f, indent=2)

    print(f"[Taskmaster Only] Kept {len(final_dialogues)} dialogues => {out_file}")


if __name__ == "__main__":
    main()