""" Copyright (c) 2024 The D-FINE Authors. All Rights Reserved. """ import argparse import json import os def update_image_paths(images, new_prefix): print("Updating image paths with new prefix...") for img in images: split = img["file_name"].split("/")[1:] img["file_name"] = os.path.join(new_prefix, *split) print("Image paths updated.") return images def create_split_annotations(original_annotations, split_image_ids, new_prefix, output_file): print(f"Creating split annotations for {output_file}...") new_images = [img for img in original_annotations["images"] if img["id"] in split_image_ids] print(f"Number of images selected: {len(new_images)}") if new_prefix is not None: new_images = update_image_paths(new_images, new_prefix) new_annotations = { "images": new_images, "annotations": [ ann for ann in original_annotations["annotations"] if ann["image_id"] in split_image_ids ], "categories": original_annotations["categories"], } print(f'Number of annotations selected: {len(new_annotations["annotations"])}') with open(output_file, "w") as f: json.dump(new_annotations, f) print(f"Annotations saved to {output_file}") def parse_arguments(): parser = argparse.ArgumentParser(description="Split and update dataset annotations.") parser.add_argument( "--base_dir", type=str, required=True, help="Base directory of the dataset, e.g., /data/Objects365/data", ) parser.add_argument( "--new_val_size", type=int, default=5000, help="Number of images to include in the new validation set (default: 5000)", ) parser.add_argument( "--output_suffix", type=str, default="new", help="Suffix to add to new annotation files (default: new)", ) return parser.parse_args() def main(): args = parse_arguments() base_dir = args.base_dir new_val_size = args.new_val_size output_suffix = args.output_suffix # Define paths based on the base directory original_train_ann_file = os.path.join(base_dir, "train", "zhiyuan_objv2_train.json") original_val_ann_file = os.path.join(base_dir, "val", "zhiyuan_objv2_val.json") new_val_ann_file = os.path.join(base_dir, "val", f"{output_suffix}_zhiyuan_objv2_val.json") new_train_ann_file = os.path.join( base_dir, "train", f"{output_suffix}_zhiyuan_objv2_train.json" ) # Check if original annotation files exist if not os.path.isfile(original_train_ann_file): print(f"Error: Training annotation file not found at {original_train_ann_file}") return if not os.path.isfile(original_val_ann_file): print(f"Error: Validation annotation file not found at {original_val_ann_file}") return # Load the original training and validation annotations print("Loading original training annotations...") with open(original_train_ann_file, "r") as f: train_annotations = json.load(f) print("Training annotations loaded.") print("Loading original validation annotations...") with open(original_val_ann_file, "r") as f: val_annotations = json.load(f) print("Validation annotations loaded.") # Extract image IDs from the original validation set print("Extracting image IDs from the validation set...") val_image_ids = [img["id"] for img in val_annotations["images"]] print(f"Total validation images: {len(val_image_ids)}") # Split image IDs for the new training and validation sets print( f"Splitting validation images into new validation set of size {new_val_size} and training set..." 
    new_val_image_ids = val_image_ids[:new_val_size]
    new_train_image_ids = val_image_ids[new_val_size:]
    print(f"New validation set size: {len(new_val_image_ids)}")
    print(f"New training set size from validation images: {len(new_train_image_ids)}")

    # Create the new validation annotation file
    print("Creating new validation annotations...")
    create_split_annotations(val_annotations, new_val_image_ids, None, new_val_ann_file)
    print("New validation annotations created.")

    # Combine the remaining validation images and annotations with the original training data
    print("Preparing new training images and annotations...")
    new_train_image_id_set = set(new_train_image_ids)  # O(1) membership tests
    new_train_images = [
        img for img in val_annotations["images"] if img["id"] in new_train_image_id_set
    ]
    print(f"Number of images from validation to add to training: {len(new_train_images)}")
    # Re-root the moved images so their paths resolve under the training directory
    new_train_images = update_image_paths(new_train_images, "images_from_val")

    new_train_annotations = [
        ann for ann in val_annotations["annotations"] if ann["image_id"] in new_train_image_id_set
    ]
    print(
        f"Number of annotations from validation to add to training: {len(new_train_annotations)}"
    )

    # Add the original training images and annotations
    print("Adding original training images and annotations...")
    new_train_images.extend(train_annotations["images"])
    new_train_annotations.extend(train_annotations["annotations"])
    print(f"Total training images: {len(new_train_images)}")
    print(f"Total training annotations: {len(new_train_annotations)}")

    # Create the new training annotation dictionary
    print("Creating new training annotations dictionary...")
    new_train_annotations_dict = {
        "images": new_train_images,
        "annotations": new_train_annotations,
        "categories": train_annotations["categories"],
    }
    print("New training annotations dictionary created.")

    # Save the new training annotations
    print("Saving new training annotations...")
    with open(new_train_ann_file, "w") as f:
        json.dump(new_train_annotations_dict, f)
    print(f"New training annotations saved to {new_train_ann_file}")

    print("Processing completed successfully.")


if __name__ == "__main__":
    main()
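# Example invocation (a sketch; the script filename below is an assumption,
# adjust it to wherever this file lives in your checkout):
#
#   python split_objects365.py --base_dir /data/Objects365/data \
#       --new_val_size 5000 --output_suffix new
#
# With the default suffix this writes val/new_zhiyuan_objv2_val.json and
# train/new_zhiyuan_objv2_train.json alongside the original annotation files.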