File size: 1,109 Bytes
7a0020b
 
 
c111c20
7a0020b
 
c111c20
64e7c31
c111c20
 
7a0020b
e5be70f
c111c20
7a0020b
 
 
c111c20
7a0020b
 
c111c20
7a0020b
 
c111c20
7a0020b
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import json
from datetime import datetime
from pathlib import Path
from taskmaster_processor import TaskmasterProcessor, RawDataProcessingConfig

def main():
    """Run the Taskmaster-only preprocessing pass and write the result to disk.

    Steps, in order:
      1. Build a ``RawDataProcessingConfig`` and wrap it in a
         ``TaskmasterProcessor``.
      2. Load dialogues from ``raw_datasets/taskmaster``.
      3. Pass them through ``filter_and_convert``.
      4. Dump the result as indented JSON to a timestamped file under
         ``processed_outputs/`` and print a one-line summary.
    """
    processor = TaskmasterProcessor(
        RawDataProcessingConfig(
            debug=True,
            max_length=512,
            min_turns=4,
            min_user_words=3,
        )
    )

    # max_examples=None is forwarded as-is; presumably "no cap" — confirm
    # against TaskmasterProcessor.load_taskmaster_dataset.
    loaded = processor.load_taskmaster_dataset(
        base_dir="raw_datasets/taskmaster",
        max_examples=None,
    )
    kept = processor.filter_and_convert(loaded)

    # The local-time stamp in the file name keeps separate runs from
    # overwriting each other's output.
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    target_dir = Path("processed_outputs")
    target_dir.mkdir(parents=True, exist_ok=True)
    destination = target_dir / f"taskmaster_only_{stamp}.json"

    with destination.open('w', encoding='utf-8') as handle:
        json.dump(kept, handle, indent=2)

    print(f"[Taskmaster Only] Kept {len(kept)} dialogues => {destination}")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()