File size: 2,597 Bytes
2183656 71ca212 2183656 71ca212 2183656 71ca212 2183656 71ca212 2183656 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 |
import json
from pathlib import Path
import logging
from typing import List, Dict
# Configure the root logger when this module is loaded so that INFO-level
# progress messages emitted below are actually shown.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by the functions in this file.
logger = logging.getLogger(__name__)
"""
Standalone script to deduplicate dialogues from multiple JSON files.
"""
def load_json_file(file_path: str) -> List[Dict]:
    """Load and parse a JSON file whose top-level value is expected to be an array.

    Args:
        file_path: Path of the JSON file to read (decoded as UTF-8).

    Returns:
        The parsed top-level JSON array. An empty list is returned, and the
        problem is logged, when the file cannot be read, does not contain
        valid JSON, or its top-level value is not an array.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        logger.error(f"Error parsing JSON from {file_path}: {e}")
        return []
    except Exception as e:
        # Deliberately broad (best effort): any failure to read the file
        # yields an empty result instead of propagating to the caller.
        logger.error(f"Error reading file {file_path}: {e}")
        return []

    # Honour the declared return type: an object or scalar at the top level
    # would otherwise be handed to callers that expect to iterate over dicts.
    if not isinstance(data, list):
        logger.error(
            f"Expected a JSON array in {file_path}, got {type(data).__name__}"
        )
        return []
    return data
def combine_json_files(input_directory: str, output_file: str):
    """
    Combine multiple JSON files, removing duplicate dialogues based on dialogue_id.

    Every ``*.json`` file directly inside ``input_directory`` is loaded and its
    dialogues are merged. Files are visited in sorted path order so that the
    "first occurrence wins" rule gives the same result on every run and on
    every filesystem. Entries that are not JSON objects, or whose
    ``dialogue_id`` is missing, falsy, or unhashable, are skipped with a
    warning instead of aborting the run.

    Args:
        input_directory: Directory containing JSON files to process
        output_file: Path to save the combined output

    Returns:
        None. The unique dialogues are written to ``output_file`` as a JSON
        array; a failure to write is logged rather than raised.
    """
    # Track unique dialogues, keyed by dialogue_id (insertion order is kept)
    dialogue_map = {}
    duplicate_count = 0

    input_path = Path(input_directory)
    if not input_path.is_dir():
        # Not fatal: the loop below finds nothing and an empty list is written,
        # but a mistyped path should not pass silently.
        logger.warning(f"Input directory does not exist: {input_path}")

    # glob() order depends on the filesystem; sort for reproducible results
    for json_file in sorted(input_path.glob('*.json')):
        logger.info(f"Processing {json_file}")
        data = load_json_file(str(json_file))

        if not isinstance(data, list):
            logger.warning(
                f"Expected a list of dialogues in {json_file}, "
                f"got {type(data).__name__}"
            )
            continue

        # Process each dialogue in the file
        for dialogue in data:
            if not isinstance(dialogue, dict):
                logger.warning(f"Skipping non-object entry in {json_file}")
                continue

            dialogue_id = dialogue.get('dialogue_id')
            if not dialogue_id:
                logger.warning(f"Found dialogue without ID in {json_file}")
                continue
            if isinstance(dialogue_id, (list, dict)):
                # Unhashable IDs cannot be used as dictionary keys
                logger.warning(f"Skipping dialogue with invalid ID in {json_file}")
                continue

            # Keep the first occurrence
            if dialogue_id in dialogue_map:
                duplicate_count += 1
                logger.debug(f"Duplicate dialogue_id found: {dialogue_id}")
            else:
                dialogue_map[dialogue_id] = dialogue

    # Convert the map of unique dialogues back to a list
    unique_dialogues = list(dialogue_map.values())

    # Save combined dialogues to a new file
    try:
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(unique_dialogues, f, indent=4)
    except Exception as e:
        logger.error(f"Error writing output file: {e}")
    else:
        logger.info(f"Successfully combined files. Found {duplicate_count} duplicates.")
        logger.info(f"Total unique dialogues: {len(unique_dialogues)}")
if __name__ == "__main__":
    # Imported here because only the command-line entry point needs it.
    import argparse

    parser = argparse.ArgumentParser(
        description="Deduplicate dialogues from multiple JSON files."
    )
    # Both arguments are optional; with no arguments the script uses the same
    # input directory and output file it was originally hard-wired to.
    parser.add_argument(
        "input_directory",
        nargs="?",
        default="/Users/joe/Desktop/Grad School/CSC525/CSC525_mod8_option2_joseph_armani/processed_outputs",
        help="Directory containing the JSON files to combine",
    )
    parser.add_argument(
        "output_file",
        nargs="?",
        default="augmented_dialogues.json",
        help="Path of the combined, de-duplicated JSON file to write",
    )
    args = parser.parse_args()

    combine_json_files(
        input_directory=args.input_directory,
        output_file=args.output_file,
    )