|
""" |
|
CSC525 - Module 8 Option 2 - Joseph Armani |
|
Description and References in the README.md file. |
|
""" |
|
import json |
|
import tensorflow as tf |
|
from typing import List, Dict |
|
from pipeline_config import PipelineConfig |
|
from processing_pipeline import ProcessingPipeline |
|
from taskmaster_processor import TaskmasterProcessor |
|
from schema_guided_dialogue_processor import SchemaGuidedProcessor |
|
|
|
def combine_datasets(taskmaster_dialogues: List[Dict],

                     schema_guided_dialogues: List[Dict]) -> List[Dict]:

    """
    Combine dialogues from both datasets into a single list.

    Duplicate dialogue IDs are resolved by prefixing the ID with its source
    dataset name ("taskmaster" / "schema_guided"); if the prefixed ID still
    collides with an already-seen ID, a numeric suffix is appended until the
    ID is unique. (The original implementation could emit duplicate IDs in
    that residual-collision case.)

    Args:
        taskmaster_dialogues: List of dialogues in pipeline format from Taskmaster
        schema_guided_dialogues: List of dialogues in pipeline format from Schema-Guided

    Returns:
        List[Dict]: Combined list of dialogues with unique dialogue IDs
    """

    combined_dialogues: List[Dict] = []

    seen_ids: set = set()

    duplicate_count = 0

    # Both sources go through the same dedup/append logic; only the
    # rename prefix differs.
    for dialogues, prefix in ((taskmaster_dialogues, 'taskmaster'),
                              (schema_guided_dialogues, 'schema_guided')):

        duplicate_count += _merge_dialogues(dialogues, prefix, seen_ids, combined_dialogues)

    print(f"Combine Datasets: Found and resolved {duplicate_count} duplicate dialogue IDs.")

    print(f"Combine Datasets: Total dialogues combined: {len(combined_dialogues)}")

    return combined_dialogues


def _merge_dialogues(dialogues: List[Dict], prefix: str,
                     seen_ids: set, combined: List[Dict]) -> int:

    """
    Append shallow copies of dialogues to combined, renaming duplicate IDs.

    Mutates seen_ids and combined in place.

    Args:
        dialogues: Source dialogues in pipeline format
        prefix: Dataset name used to rename colliding IDs
        seen_ids: Set of dialogue IDs already emitted (updated in place)
        combined: Destination list (appended to in place)

    Returns:
        int: Number of duplicate IDs that were resolved
    """

    duplicates = 0

    for dialogue in dialogues:

        # Shallow copy so renaming the ID does not mutate the caller's dict.
        dialogue_copy = dialogue.copy()

        dialogue_id = dialogue_copy['dialogue_id']

        if dialogue_id in seen_ids:

            duplicates += 1

            dialogue_id = f"{prefix}_{dialogue_id}"

            # The prefixed ID may itself already exist; append a numeric
            # suffix until the ID is genuinely unique.
            base_id = dialogue_id
            suffix = 2
            while dialogue_id in seen_ids:
                dialogue_id = f"{base_id}_{suffix}"
                suffix += 1

        seen_ids.add(dialogue_id)

        dialogue_copy['dialogue_id'] = dialogue_id

        combined.append(dialogue_copy)

    return duplicates
|
|
|
def main():

    """
    Entry point: load the Taskmaster and Schema-Guided datasets, combine
    them into one list of dialogues, and run the result through the
    processing pipeline. Prints progress to stdout; re-raises any failure
    after reporting it.
    """

    # Pipeline configuration; use a larger batch size when a GPU is visible.
    config = PipelineConfig(

        min_length=1,

        max_length=512,

        batch_size=32 if tf.config.list_physical_devices('GPU') else 16,

        max_turns_per_dialogue=12,

        max_variations_per_turn=4,

        max_sampled_variations=2,

        context_window_size=4,

        max_complexity_threshold=100,

        use_cache=False,

        debug=True,

        allowed_speakers=['user', 'assistant'],

        required_fields=['dialogue_id', 'turns']

    )

    try:

        # Cap examples per dataset (small value for quick runs).
        max_examples = 5

        print("Loading Taskmaster dataset")

        taskmaster_processor = TaskmasterProcessor(config, use_ontology=False)

        taskmaster_dialogues = taskmaster_processor.load_dataset('./datasets/taskmaster', max_examples=max_examples)

        taskmaster_pipeline_dialogues = taskmaster_processor.convert_to_pipeline_format(taskmaster_dialogues)

        print(f"Processed Taskmaster dialogues: {len(taskmaster_pipeline_dialogues)}")

        print("Loading Schema-Guided dataset")

        schema_dialogue_processor = SchemaGuidedProcessor(config)

        schema_dialogues = schema_dialogue_processor.load_dataset('./datasets/schema_guided', max_examples=max_examples)

        schema_pipeline_dialogues = schema_dialogue_processor.convert_to_pipeline_format(schema_dialogues)

        print(f"Processed Schema-Guided dialogues: {len(schema_pipeline_dialogues)}")

        print("Combining datasets")

        combined_dialogues = combine_datasets(taskmaster_pipeline_dialogues, schema_pipeline_dialogues)

        print(f"Combined Dialogues: {len(combined_dialogues)}")

        if not combined_dialogues:

            print("Combined dialogues are empty. Exiting.")

            return

        print("Processing combined dataset")

        pipeline = ProcessingPipeline(config)

        try:

            output_path = pipeline.process_dataset(combined_dialogues)

            print(f"Processing complete. Results saved to {output_path}")

        finally:

            # Release pipeline resources even when processing raises
            # (the original skipped cleanup on error).
            pipeline.cleanup()

    except Exception as e:

        print(f"Processing failed: {e}")

        raise
|
|
|
# Run the pipeline only when executed as a script (not on import).
if __name__ == "__main__":

    main()
|
|