File size: 1,187 Bytes
7a0020b
 
 
 
71ca212
 
7a0020b
 
 
 
 
e5be70f
7a0020b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import json
from datetime import datetime
from pathlib import Path

from data_augmentation.pipeline_config import PipelineConfig
from data_augmentation.taskmaster_processor import TaskmasterProcessor

def main():
    """Process the Taskmaster dataset on its own and dump the result as JSON.

    Builds a ``PipelineConfig`` (max_length=512, min_turns=4,
    min_user_words=3, debug=True), hands it to a ``TaskmasterProcessor``,
    loads dialogues from ``datasets/taskmaster`` with ``max_examples=None``,
    and passes them through ``filter_and_convert``. The returned collection
    is written to ``processed_outputs/taskmaster_only_<timestamp>.json``
    (timestamp from local ``datetime.now()``), and a one-line summary with
    the kept count and output path is printed.
    """
    # Settings for this run.
    pipeline_cfg = PipelineConfig(
        max_length=512,
        min_turns=4,
        min_user_words=3,
        debug=True,
    )
    tm_processor = TaskmasterProcessor(pipeline_cfg)

    # Load everything under the dataset directory, then filter/convert it.
    raw_dialogues = tm_processor.load_taskmaster_dataset(
        base_dir="datasets/taskmaster",
        max_examples=None,
    )
    kept_dialogues = tm_processor.filter_and_convert(raw_dialogues)

    # Timestamped file name so repeated runs do not overwrite each other.
    run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    target_dir = Path("processed_outputs")
    target_dir.mkdir(parents=True, exist_ok=True)
    target_path = target_dir / f"taskmaster_only_{run_stamp}.json"

    with target_path.open('w', encoding='utf-8') as handle:
        json.dump(kept_dialogues, handle, indent=2)

    print(f"[Taskmaster Only] Kept {len(kept_dialogues)} dialogues => {target_path}")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()