import json | |
from datetime import datetime | |
from pathlib import Path | |
from pipeline_config import PipelineConfig | |
from taskmaster_processor import TaskmasterProcessor | |
def main():
    """Run the Taskmaster-only preprocessing pass and save the result as JSON.

    Steps, in order:
      1. Build a ``PipelineConfig`` with fixed settings.
      2. Create a ``TaskmasterProcessor`` from that config.
      3. Load the raw dialogues from ``datasets/taskmaster``.
      4. Pass them through ``filter_and_convert``.
      5. Write the result to a timestamped JSON file under
         ``processed_outputs/`` and print a one-line summary.
    """
    pipeline_config = PipelineConfig(
        max_length=512,
        min_turns=3,
        min_user_words=3,
        debug=True,
    )
    taskmaster = TaskmasterProcessor(pipeline_config)

    # max_examples=None is forwarded as-is; presumably "no cap" -- confirm
    # against TaskmasterProcessor.load_taskmaster_dataset.
    raw_dialogues = taskmaster.load_taskmaster_dataset(
        base_dir="datasets/taskmaster",
        max_examples=None,
    )
    kept_dialogues = taskmaster.filter_and_convert(raw_dialogues)

    # The timestamp in the file name keeps successive runs from overwriting
    # each other (down to one-second resolution).
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    target_dir = Path("processed_outputs")
    target_dir.mkdir(parents=True, exist_ok=True)
    destination = target_dir / f"taskmaster_only_{stamp}.json"

    with destination.open("w", encoding="utf-8") as handle:
        json.dump(kept_dialogues, handle, indent=2)

    print(f"[Taskmaster Only] Kept {len(kept_dialogues)} dialogues => {destination}")
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()