"""
CSC525 - Module 8 Option 2 - Joseph Armani
Description and References in the README.md file.
"""
import tensorflow as tf
from typing import List, Dict
from pipeline_config import PipelineConfig
from processing_pipeline import ProcessingPipeline
from taskmaster_processor import TaskmasterProcessor
from schema_guided_dialogue_processor import SchemaGuidedProcessor
def combine_datasets(taskmaster_dialogues: List[Dict],
                     schema_guided_dialogues: List[Dict]) -> List[Dict]:
    """
    Combine dialogues from both datasets into a single list.

    Args:
        taskmaster_dialogues: List of dialogues in pipeline format from Taskmaster
        schema_guided_dialogues: List of dialogues in pipeline format from Schema-Guided

    Returns:
        List[Dict]: Combined list of dialogues
    """
    # Ensure unique dialogue IDs across both sources
    combined_dialogues = []
    seen_ids = set()
    duplicate_count = 0  # Track duplicates for reporting

    for source_prefix, dialogues in (("taskmaster", taskmaster_dialogues),
                                     ("schema_guided", schema_guided_dialogues)):
        for dialogue in dialogues:
            dialogue_copy = dialogue.copy()
            dialogue_id = dialogue_copy['dialogue_id']
            if dialogue_id in seen_ids:
                duplicate_count += 1
                # Prefix the ID with its source dataset; if that prefixed ID has also
                # been seen, append a numeric suffix until it is unique
                candidate = f"{source_prefix}_{dialogue_id}"
                suffix = 1
                while candidate in seen_ids:
                    candidate = f"{source_prefix}_{dialogue_id}_{suffix}"
                    suffix += 1
                dialogue_id = candidate
            seen_ids.add(dialogue_id)
            dialogue_copy['dialogue_id'] = dialogue_id
            combined_dialogues.append(dialogue_copy)

    # Log the results
    print(f"Combine Datasets: Found and resolved {duplicate_count} duplicate dialogue IDs.")
    print(f"Combine Datasets: Total dialogues combined: {len(combined_dialogues)}")
    return combined_dialogues
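
# Illustrative sketch (not executed): if both datasets contain a dialogue with the
# ID "d1", combine_datasets keeps the Taskmaster copy as "d1" and renames the
# Schema-Guided copy, e.g.
#     combine_datasets([{"dialogue_id": "d1", "turns": []}],
#                      [{"dialogue_id": "d1", "turns": []}])
#     -> dialogue IDs ["d1", "schema_guided_d1"]
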
def main():
    # Configuration
    config = PipelineConfig(
        min_length=1,
        max_length=512,
        batch_size=32 if tf.config.list_physical_devices('GPU') else 16,
        max_turns_per_dialogue=12,
        max_variations_per_turn=4,
        max_sampled_variations=2,
        context_window_size=4,
        max_complexity_threshold=100,
        use_cache=False,
        debug=True,
        allowed_speakers=['user', 'assistant'],
        required_fields=['dialogue_id', 'turns']
    )
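    # Note on the settings above (semantics assumed from the field names, since
    # PipelineConfig is defined elsewhere): batch_size is larger when TensorFlow can
    # see a GPU, the max_turns/max_variations values presumably cap dialogue length
    # and the number of augmented variants kept per turn, and debug=True enables
    # more verbose processing output.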
    try:
        # Set max_examples (Optional[int]) for testing
        max_examples = 5

        # Initialize and load Taskmaster dataset
        print("Loading Taskmaster dataset")
        taskmaster_processor = TaskmasterProcessor(config, use_ontology=False)
        taskmaster_dialogues = taskmaster_processor.load_dataset('./datasets/taskmaster', max_examples=max_examples)
        taskmaster_pipeline_dialogues = taskmaster_processor.convert_to_pipeline_format(taskmaster_dialogues)
        print(f"Processed Taskmaster dialogues: {len(taskmaster_pipeline_dialogues)}")

        # Initialize and load Schema-Guided dataset
        print("Loading Schema-Guided dataset")
        schema_dialogue_processor = SchemaGuidedProcessor(config)
        schema_dialogues = schema_dialogue_processor.load_dataset('./datasets/schema_guided', max_examples=max_examples)
        schema_pipeline_dialogues = schema_dialogue_processor.convert_to_pipeline_format(schema_dialogues)
        print(f"Processed Schema-Guided dialogues: {len(schema_pipeline_dialogues)}")
        # Combine datasets
        print("Combining datasets")
        combined_dialogues = combine_datasets(taskmaster_pipeline_dialogues, schema_pipeline_dialogues)
        print(f"Combined Dialogues: {len(combined_dialogues)}")
        if not combined_dialogues:
            print("Combined dialogues are empty. Exiting.")
            return

        # Process through augmentation pipeline
        print("Processing combined dataset")
        pipeline = ProcessingPipeline(config)
        output_path = pipeline.process_dataset(combined_dialogues)
        print(f"Processing complete. Results saved to {output_path}")
        pipeline.cleanup()
    except Exception as e:
        print(f"Processing failed: {str(e)}")
        raise
if __name__ == "__main__":
    main()