csc525_retrieval_based_chatbot / new_iteration /run_taskmaster_processor.py
JoeArmani
restructuring
71ca212
raw
history blame
1.19 kB
import json
from datetime import datetime
from pathlib import Path
from data_augmentation.pipeline_config import PipelineConfig
from data_augmentation.taskmaster_processor import TaskmasterProcessor
def main():
    """Run the Taskmaster-only processing pass and dump the result as JSON.

    Steps performed, in order:
      1. Build a ``PipelineConfig`` with the settings hard-coded below.
      2. Create a ``TaskmasterProcessor`` from that config and load the
         dialogues found under ``datasets/taskmaster``.
      3. Pass the loaded dialogues through ``filter_and_convert``.
      4. Write the converted dialogues to
         ``processed_outputs/taskmaster_only_<YYYYmmdd_HHMMSS>.json``
         and print a one-line summary.
    """
    # Settings for this run.
    pipeline_config = PipelineConfig(
        max_length=512,
        min_turns=4,
        min_user_words=3,
        debug=True,
    )
    taskmaster = TaskmasterProcessor(pipeline_config)

    # NOTE(review): max_examples=None presumably means "no cap" -- confirm
    # against TaskmasterProcessor.load_taskmaster_dataset.
    raw_dialogues = taskmaster.load_taskmaster_dataset(
        base_dir="datasets/taskmaster",
        max_examples=None,
    )
    kept_dialogues = taskmaster.filter_and_convert(raw_dialogues)

    # The timestamp is taken after processing, so the file name reflects
    # the moment the data is written.
    run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    target_dir = Path("processed_outputs")
    target_dir.mkdir(parents=True, exist_ok=True)
    destination = target_dir / f"taskmaster_only_{run_stamp}.json"

    with destination.open('w', encoding='utf-8') as handle:
        json.dump(kept_dialogues, handle, indent=2)

    print(f"[Taskmaster Only] Kept {len(kept_dialogues)} dialogues => {destination}")
# Script entry point: call main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()