File size: 2,597 Bytes
2183656
 
 
 
 
 
 
 
71ca212
 
 
 
2183656
71ca212
2183656
 
 
 
 
 
 
 
 
 
 
 
71ca212
2183656
 
 
 
71ca212
2183656
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
"""
Standalone script to deduplicate dialogues from multiple JSON files.
"""
# NOTE: the docstring must precede the imports to actually become the
# module's __doc__; in the original it was a dead string expression.
import json
import logging
from pathlib import Path
from typing import Dict, List

# Script-style one-shot logging configuration; library code would normally
# leave configuration to the application.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def load_json_file(file_path: str) -> List[Dict]:
    """Load *file_path* as JSON and return the parsed content.

    Any read or parse failure is logged and reported as an empty list,
    so callers can treat a broken file as simply "no dialogues".
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            parsed = json.load(handle)
    except json.JSONDecodeError as e:
        # Malformed JSON: log the parse error specifically.
        logger.error(f"Error parsing JSON from {file_path}: {e}")
        return []
    except Exception as e:
        # Anything else (missing file, permissions, ...) — best-effort skip.
        logger.error(f"Error reading file {file_path}: {e}")
        return []
    return parsed

def combine_json_files(input_directory: str, output_file: str):
    """
    Combine multiple JSON files, removing duplicate dialogues based on dialogue_id.

    Files are processed in sorted name order so that "first occurrence wins"
    is deterministic across runs (Path.glob order is filesystem-dependent).

    Args:
        input_directory: Directory containing JSON files to process
        output_file: Path to save the combined output
    """
    # Map of dialogue_id -> first dialogue seen with that id.
    dialogue_map = {}
    duplicate_count = 0

    # Process all JSON files in the directory, in a deterministic order.
    input_path = Path(input_directory)
    for json_file in sorted(input_path.glob('*.json')):
        logger.info(f"Processing {json_file}")

        data = load_json_file(str(json_file))

        # Process each dialogue in the file
        for dialogue in data:
            dialogue_id = dialogue.get('dialogue_id')

            # Test `is None` rather than truthiness: ids such as 0 or ""
            # are real ids and must not be discarded as "missing".
            if dialogue_id is None:
                logger.warning(f"Found dialogue without ID in {json_file}")
                continue

            # Keep the first occurrence; later ones only count as duplicates.
            if dialogue_id in dialogue_map:
                duplicate_count += 1
                logger.debug(f"Duplicate dialogue_id found: {dialogue_id}")
            else:
                dialogue_map[dialogue_id] = dialogue

    # Convert the map of unique dialogues back to a list
    unique_dialogues = list(dialogue_map.values())

    # Save combined dialogues to a new file
    try:
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(unique_dialogues, f, indent=4)
        logger.info(f"Successfully combined files. Found {duplicate_count} duplicates.")
        logger.info(f"Total unique dialogues: {len(unique_dialogues)}")
    except Exception as e:
        logger.error(f"Error writing output file: {e}")

if __name__ == "__main__":
    # Hard-coded local paths; adjust for other environments.
    input_dir = "/Users/joe/Desktop/Grad School/CSC525/CSC525_mod8_option2_joseph_armani/processed_outputs"
    output_path = "augmented_dialogues.json"
    combine_json_files(input_directory=input_dir, output_file=output_path)