|
import json |
|
from datetime import datetime |
|
from pathlib import Path |
|
from taskmaster_processor import TaskmasterProcessor, RawDataProcessingConfig |
|
|
|
def main(
    base_dir: str = "raw_datasets/taskmaster",
    output_dir: str = "processed_outputs",
) -> None:
    """Process the Taskmaster raw data and write the kept dialogues to JSON.

    Dialogues are loaded from ``base_dir`` with ``TaskmasterProcessor``, run
    through ``filter_and_convert``, and serialized to a file named
    ``taskmaster_only_<YYYYmmdd_HHMMSS>.json`` inside ``output_dir``. A
    one-line summary with the dialogue count and output path is printed.

    Args:
        base_dir: Directory passed to ``load_taskmaster_dataset``. Defaults
            to the location this script has always read from.
        output_dir: Directory that receives the JSON file; it is created
            (including parents) when missing. Defaults to the location this
            script has always written to.

    Returns:
        None. Kept deliberately so wrappers that do ``sys.exit(main())``
        still exit with status 0.
    """
    # Values are forwarded as-is; what each threshold means is defined by
    # RawDataProcessingConfig, which is not visible from this file.
    # NOTE(review): debug=True is fixed here -- confirm that is intended
    # for regular (non-debug) runs.
    config = RawDataProcessingConfig(
        debug=True,
        max_length=512,
        min_turns=4,
        min_user_words=3,
    )
    processor = TaskmasterProcessor(config)

    # max_examples=None is passed explicitly; presumably this means
    # "no cap on loaded examples" -- verify against the processor.
    dialogues = processor.load_taskmaster_dataset(
        base_dir=base_dir,
        max_examples=None,
    )

    # The result must be JSON-serializable and support len(), since it is
    # handed to json.dump and counted in the summary line below.
    final_dialogues = processor.filter_and_convert(dialogues)

    # Timestamped file name (local time) so repeated runs do not overwrite
    # each other's output.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)
    out_file = output_path / f"taskmaster_only_{timestamp}.json"

    with open(out_file, "w", encoding="utf-8") as f:
        json.dump(final_dialogues, f, indent=2)

    print(f"[Taskmaster Only] Kept {len(final_dialogues)} dialogues => {out_file}")
|
|
|
# Script entry point: run the pipeline only when this file is executed
# directly, so importing the module has no side effects.
if __name__ == "__main__":
    main()