|
import json |
|
from pathlib import Path |
|
import logging |
|
from typing import List, Dict |
|
|
|
# Configure the root logger at import time so that INFO-level progress
# messages emitted by this script are actually displayed.
logging.basicConfig(level=logging.INFO)

# Module-level logger used by the functions in this file.
logger = logging.getLogger(__name__)
|
|
|
""" |
|
Standalone script that combines the JSON files in a directory and removes duplicate dialogues, keeping the first dialogue seen for each dialogue_id.
|
""" |
|
|
|
def load_json_file(file_path: str) -> List[Dict]:
    """Load a JSON file whose top-level value is expected to be an array.

    Failures are logged and reported as an empty list, so one unreadable or
    malformed file does not abort processing of the remaining files.

    Args:
        file_path: Path of the JSON file to read (decoded as UTF-8).

    Returns:
        The parsed top-level JSON array, or ``[]`` when the file cannot be
        read, is not valid JSON, or its top-level value is not an array.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        logger.error(f"Error parsing JSON from {file_path}: {e}")
        return []
    except (OSError, UnicodeDecodeError) as e:
        # Missing file, permission problem, or bytes that are not UTF-8.
        logger.error(f"Error reading file {file_path}: {e}")
        return []

    # Valid JSON may still be an object or a scalar; callers iterate the
    # result as a list of dialogues, so anything else is rejected here.
    if not isinstance(data, list):
        logger.error(
            f"Expected a JSON array in {file_path}, got {type(data).__name__}"
        )
        return []

    return data
|
|
|
def combine_json_files(input_directory: str, output_file: str) -> None:
    """Combine multiple JSON files, removing duplicate dialogues.

    Every ``*.json`` file directly inside ``input_directory`` is read with
    ``load_json_file``. Dialogues are keyed on their ``dialogue_id``: the
    first occurrence of each ID is kept and later occurrences are counted as
    duplicates. The surviving dialogues are written to ``output_file`` as an
    indented JSON array. Problems are logged rather than raised.

    Args:
        input_directory: Directory containing JSON files to process.
        output_file: Path to save the combined output.
    """
    input_path = Path(input_directory)
    if not input_path.is_dir():
        # Stop here rather than overwrite output_file with an empty list.
        logger.error(f"Input directory not found: {input_directory}")
        return

    output_path = Path(output_file).resolve()
    dialogue_map = {}
    duplicate_count = 0

    # glob() yields entries in arbitrary order; sorting makes "first
    # occurrence wins" and the output order reproducible across runs.
    for json_file in sorted(input_path.glob('*.json')):
        if json_file.resolve() == output_path:
            # Output of a previous run stored in the input directory must
            # not be fed back in as a source file.
            logger.info(f"Skipping output file {json_file}")
            continue

        logger.info(f"Processing {json_file}")
        data = load_json_file(str(json_file))

        if not isinstance(data, list):
            logger.warning(f"Skipping {json_file}: top-level value is not a list")
            continue

        for dialogue in data:
            if not isinstance(dialogue, dict):
                logger.warning(f"Skipping non-object entry in {json_file}")
                continue

            dialogue_id = dialogue.get('dialogue_id')

            # Only None and "" count as missing; a plain truthiness test
            # would also discard a legitimate ID of 0.
            if dialogue_id is None or dialogue_id == "":
                logger.warning(f"Found dialogue without ID in {json_file}")
                continue

            # JSON arrays/objects are unhashable and cannot be dict keys.
            if isinstance(dialogue_id, (list, dict)):
                logger.warning(f"Found dialogue with unusable ID in {json_file}")
                continue

            if dialogue_id in dialogue_map:
                duplicate_count += 1
                logger.debug(f"Duplicate dialogue_id found: {dialogue_id}")
            else:
                dialogue_map[dialogue_id] = dialogue

    unique_dialogues = list(dialogue_map.values())

    try:
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(unique_dialogues, f, indent=4)
    except OSError as e:
        logger.error(f"Error writing output file: {e}")
    else:
        # Report success only after the file has been written and closed.
        logger.info(f"Successfully combined files. Found {duplicate_count} duplicates.")
        logger.info(f"Total unique dialogues: {len(unique_dialogues)}")
|
|
|
if __name__ == "__main__":
    # Imported here because argument parsing is only needed for script use.
    import argparse

    parser = argparse.ArgumentParser(
        description="Combine JSON dialogue files and remove duplicate dialogue_ids."
    )
    # Both arguments are optional; the defaults reproduce the previously
    # hard-coded paths, so running with no arguments behaves as before.
    parser.add_argument(
        "input_directory",
        nargs="?",
        default="/Users/joe/Desktop/Grad School/CSC525/CSC525_mod8_option2_joseph_armani/processed_outputs",
        help="Directory containing the JSON files to combine.",
    )
    parser.add_argument(
        "output_file",
        nargs="?",
        default="augmented_dialogues.json",
        help="Path of the combined JSON file to write.",
    )
    args = parser.parse_args()

    combine_json_files(
        input_directory=args.input_directory,
        output_file=args.output_file,
    )