"""
CSC525 - Module 8 Option 2 - Joseph Armani
Description and References in the README.md file.
"""
import tensorflow as tf
from typing import List, Dict
from pipeline_config import PipelineConfig
from processing_pipeline import ProcessingPipeline
from taskmaster_processor import TaskmasterProcessor
from schema_guided_dialogue_processor import SchemaGuidedProcessor

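# Each pipeline-format dialogue is expected to be a dict with at least the
# 'dialogue_id' and 'turns' keys (cf. required_fields in main()).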
def combine_datasets(taskmaster_dialogues: List[Dict],
                     schema_guided_dialogues: List[Dict]) -> List[Dict]:
    """
    Combine dialogues from both datasets into a single list with unique IDs.

    Args:
        taskmaster_dialogues: List of dialogues in pipeline format from Taskmaster
        schema_guided_dialogues: List of dialogues in pipeline format from Schema-Guided

    Returns:
        List[Dict]: Combined list of dialogues with unique dialogue IDs
    """
    combined_dialogues = []
    seen_ids = set()
    duplicate_count = 0  # Track duplicates for reporting

    def add_dialogues(dialogues: List[Dict], source_prefix: str) -> None:
        """Append shallow copies of dialogues, renaming any duplicate IDs."""
        nonlocal duplicate_count
        for dialogue in dialogues:
            dialogue_copy = dialogue.copy()
            dialogue_id = dialogue_copy['dialogue_id']
            if dialogue_id in seen_ids:
                duplicate_count += 1
                # Prefix with the source dataset; append a counter if the
                # prefixed ID itself collides with a previously seen ID.
                candidate = f"{source_prefix}_{dialogue_id}"
                suffix = 1
                while candidate in seen_ids:
                    candidate = f"{source_prefix}_{dialogue_id}_{suffix}"
                    suffix += 1
                dialogue_id = candidate
            seen_ids.add(dialogue_id)
            dialogue_copy['dialogue_id'] = dialogue_id
            combined_dialogues.append(dialogue_copy)

    add_dialogues(taskmaster_dialogues, "taskmaster")
    add_dialogues(schema_guided_dialogues, "schema_guided")

    # Log the results
    print(f"Combine Datasets: Found and resolved {duplicate_count} duplicate dialogue IDs.")
    print(f"Combine Datasets: Total dialogues combined: {len(combined_dialogues)}")

    return combined_dialogues

def main():
    # Configuration
    config = PipelineConfig(
        min_length=1,
        max_length=512,
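        # Use larger batches when a GPU is available; 32/16 are assumed
        # defaults here, not tuned values.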
        batch_size=32 if tf.config.list_physical_devices('GPU') else 16,
        max_turns_per_dialogue=12,
        max_variations_per_turn=4,
        max_sampled_variations=2,
        context_window_size=4,
        max_complexity_threshold=100,
        use_cache=False,
        debug=True,
        allowed_speakers=['user', 'assistant'],
        required_fields=['dialogue_id', 'turns']
    )

    try:
        # Cap examples per dataset for quick testing (Optional[int]);
        # set to None to process the full datasets.
        max_examples = 5
        
        # Initialize and load Taskmaster dataset
        print("Loading Taskmaster dataset")
        taskmaster_processor = TaskmasterProcessor(config, use_ontology=False)
        taskmaster_dialogues = taskmaster_processor.load_dataset('./datasets/taskmaster', max_examples=max_examples)
        taskmaster_pipeline_dialogues = taskmaster_processor.convert_to_pipeline_format(taskmaster_dialogues)
        print(f"Processed Taskmaster dialogues: {len(taskmaster_pipeline_dialogues)}")
        
        # Initialize and load Schema-Guided dataset
        print("Loading Schema-Guided dataset")
        schema_dialogue_processor = SchemaGuidedProcessor(config)
        schema_dialogues = schema_dialogue_processor.load_dataset('./datasets/schema_guided', max_examples=max_examples)
        schema_pipeline_dialogues = schema_dialogue_processor.convert_to_pipeline_format(schema_dialogues)
        print(f"Processed Schema-Guided dialogues: {len(schema_pipeline_dialogues)}")
        
        # Combine datasets
        print("Combining datasets")
        combined_dialogues = combine_datasets(taskmaster_pipeline_dialogues, schema_pipeline_dialogues)
        print(f"Combined Dialogues: {len(combined_dialogues)}")
        
        if not combined_dialogues:
            print("Combined dialogues are empty. Exiting.")
            return
        
        # Process through augmentation pipeline
        print("Processing combined dataset")
        pipeline = ProcessingPipeline(config)
        output_path = pipeline.process_dataset(combined_dialogues)
        print(f"Processing complete. Results saved to {output_path}")
        pipeline.cleanup()
        
    except Exception as e:
        print(f"Processing failed: {str(e)}")
        raise

if __name__ == "__main__":
    main()