import json
from datetime import datetime
from pathlib import Path

from taskmaster_processor import TaskmasterProcessor, RawDataProcessingConfig


def main():
    """Process the Taskmaster dataset and dump the kept dialogues to JSON.

    Loads dialogues from ``raw_datasets/taskmaster`` via a
    ``TaskmasterProcessor``, passes them through ``filter_and_convert``, and
    writes the result to ``processed_outputs/taskmaster_only_<timestamp>.json``
    (the directory is created when missing). A one-line summary with the
    number of kept dialogues and the output path is printed at the end.
    """
    # Options handed to the processor; how each threshold is applied is
    # decided inside taskmaster_processor, not in this script.
    settings = RawDataProcessingConfig(
        debug=True,
        max_length=512,
        min_turns=4,
        min_user_words=3,
    )
    processor = TaskmasterProcessor(settings)

    # max_examples=None is passed explicitly — presumably "no cap on the
    # number of loaded examples"; confirm against load_taskmaster_dataset.
    raw_dialogues = processor.load_taskmaster_dataset(
        base_dir="raw_datasets/taskmaster",
        max_examples=None,
    )
    final_dialogues = processor.filter_and_convert(raw_dialogues)

    # The timestamp is taken after processing, so the file name reflects
    # when the results were ready to be written.
    finished_at = datetime.now()
    timestamp = finished_at.strftime("%Y%m%d_%H%M%S")

    destination = Path("processed_outputs")
    destination.mkdir(parents=True, exist_ok=True)
    out_file = destination.joinpath(f"taskmaster_only_{timestamp}.json")

    with out_file.open("w", encoding="utf-8") as sink:
        json.dump(final_dialogues, sink, indent=2)

    print(f"[Taskmaster Only] Kept {len(final_dialogues)} dialogues => {out_file}")


if __name__ == "__main__":
    main()