File size: 1,109 Bytes
7a0020b c111c20 7a0020b c111c20 64e7c31 c111c20 7a0020b e5be70f c111c20 7a0020b c111c20 7a0020b c111c20 7a0020b c111c20 7a0020b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 |
import json
from datetime import datetime
from pathlib import Path
from taskmaster_processor import TaskmasterProcessor, RawDataProcessingConfig
def main():
    """Process the Taskmaster dataset and dump the result to a JSON file.

    Loads dialogues from ``raw_datasets/taskmaster`` through
    ``TaskmasterProcessor``, passes them through ``filter_and_convert``,
    writes whatever that returns to
    ``processed_outputs/taskmaster_only_<YYYYmmdd_HHMMSS>.json`` and prints
    a one-line summary with the number of kept dialogues and the output path.
    """
    source_dir = "raw_datasets/taskmaster"

    # NOTE(review): the meaning of these thresholds is defined by
    # RawDataProcessingConfig, which is not visible from this file.
    settings = RawDataProcessingConfig(
        debug=True,
        max_length=512,
        min_turns=4,
        min_user_words=3,
    )
    tm_processor = TaskmasterProcessor(settings)

    # max_examples=None is forwarded as-is; presumably "no cap" -- confirm
    # against load_taskmaster_dataset.
    loaded = tm_processor.load_taskmaster_dataset(
        base_dir=source_dir,
        max_examples=None,
    )
    kept = tm_processor.filter_and_convert(loaded)

    # The timestamp is taken after processing so the file name reflects the
    # moment the output is written, keeping successive runs from colliding.
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    target_dir = Path("processed_outputs")
    target_dir.mkdir(parents=True, exist_ok=True)
    target = target_dir / f"taskmaster_only_{stamp}.json"

    with target.open('w', encoding='utf-8') as sink:
        json.dump(kept, sink, indent=2)

    print(f"[Taskmaster Only] Kept {len(kept)} dialogues => {target}")
# Run the pipeline only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()