Spaces:

mckabue
/

document-similarity-matching-using-visual-layout-features-archive

Build error

File size: 3,154 Bytes

ea5f6fe
 
 
 
 
 
 
410a3d6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ea5f6fe
3cc12b1
410a3d6
3cc12b1
ea5f6fe
410a3d6
ea5f6fe
410a3d6
 
ea5f6fe
 
410a3d6
 
 
 
 
 
 
 
 
 
ea5f6fe
3cc12b1
410a3d6
 
ea5f6fe
410a3d6
ea5f6fe
e2c51cc
410a3d6
 
 
 
92b2916
410a3d6
6cb8ba2
410a3d6
 
 
6cb8ba2
410a3d6
 
e2c51cc
 
ea5f6fe
e2c51cc
 
 
 
92b2916
e2c51cc
ea5f6fe
e2c51cc
410a3d6
e2c51cc
ea5f6fe
e2c51cc
410a3d6
e2c51cc
3cc12b1
410a3d6
 
 
 
 
ea5f6fe
 
 
 
 
410a3d6

# Modified based on https://github.com/akarazniewicz/cocosplit/blob/master/cocosplit.py

import json
import argparse
import funcy
from sklearn.model_selection import train_test_split

# Command-line interface of the splitter.
# The three path options and the ratio are all mandatory: without them the
# script would only fail later, when None is handed to open().
parser = argparse.ArgumentParser(
    description="Splits COCO annotations file into training and test sets."
)
parser.add_argument(
    "--annotation-path",
    metavar="coco_annotations",
    type=str,
    required=True,
    help="Path to COCO annotations file.",
)
parser.add_argument(
    "--train",
    type=str,
    required=True,
    help="Where to store COCO training annotations",
)
parser.add_argument(
    "--test",
    type=str,
    required=True,
    help="Where to store COCO test annotations",
)
parser.add_argument(
    "--split-ratio",
    dest="split_ratio",
    type=float,
    required=True,
    help="A percentage of a split; a number in (0, 1)",
)
# Boolean switch: present -> True, absent -> False.
parser.add_argument(
    "--having-annotations",
    dest="having_annotations",
    action="store_true",
    help="Ignore all images without annotations. Keep only these with at least one annotation",
)


def save_coco(file, tagged_data):
    """Serialise ``tagged_data`` as JSON into the file at path ``file``.

    The file is opened for text writing with UTF-8 encoding (replacing any
    existing content) and the JSON is written with a 2-space indent and
    keys in sorted order.
    """
    dump_options = {"indent": 2, "sort_keys": True}
    with open(file, mode="wt", encoding="UTF-8") as out_handle:
        json.dump(tagged_data, out_handle, **dump_options)


def filter_annotations(annotations, images):
    """Return the annotations that belong to one of the given images.

    Args:
        annotations: iterable of dicts, each with an ``"image_id"`` entry.
        images: iterable of dicts, each with an ``"id"`` entry.

    Returns:
        A new list holding, in their original order, the annotations whose
        ``image_id`` matches the ``id`` of some image. Both sides are
        converted with ``int()`` before comparison.
    """
    # A set built once gives O(1) membership tests; a list here would make
    # the filter quadratic in the dataset size.
    image_ids = {int(image["id"]) for image in images}
    return [
        annotation
        for annotation in annotations
        if int(annotation["image_id"]) in image_ids
    ]


def main(
    annotation_path,
    split_ratio,
    having_annotations,
    train_save_path,
    test_save_path,
    random_state=None,
):
    """Split a COCO annotations file into a training and a test file.

    Images that have at least one annotation and images that have none are
    split separately with ``train_test_split`` (same ratio, same
    ``random_state``), then merged, so both kinds are distributed across the
    two outputs according to ``split_ratio``.

    Args:
        annotation_path: path of the COCO JSON file to read; it must contain
            ``"images"`` and ``"annotations"`` lists.
        split_ratio: passed to ``train_test_split`` as ``train_size``.
        having_annotations: when true, images without annotations are left
            out of both outputs.
        train_save_path: where the training subset is written.
        test_save_path: where the test subset is written.
        random_state: passed through to ``train_test_split``.

    Every other top-level key of the input file is copied unchanged into
    both output files. A one-line summary is printed at the end.
    """

    def split_images(subset):
        # train_test_split raises ValueError on an empty input, so an empty
        # subset simply yields two empty halves.
        if not subset:
            return [], []
        return train_test_split(
            subset, train_size=split_ratio, random_state=random_state
        )

    with open(annotation_path, "rt", encoding="UTF-8") as annotation_file:
        coco = json.load(annotation_file)

    images = coco["images"]
    annotations = coco["annotations"]

    # Ids are normalised with int() on both sides (as filter_annotations
    # does) and kept in a set for constant-time membership tests.
    ids_with_annotations = {int(a["image_id"]) for a in annotations}

    # Images with annotations
    img_ann = [i for i in images if int(i["id"]) in ids_with_annotations]
    tr, ts = split_images(img_ann)

    if not having_annotations:
        # Images without annotations are only split when they are kept, so
        # a tiny unannotated subset cannot fail a run that discards it.
        img_wo_ann = [
            i for i in images if int(i["id"]) not in ids_with_annotations
        ]
        tr_wo_ann, ts_wo_ann = split_images(img_wo_ann)

        # Merging the 2 image lists (i.e. with and without annotation)
        tr.extend(tr_wo_ann)
        ts.extend(ts_wo_ann)

    # Train Data
    coco.update({"images": tr, "annotations": filter_annotations(annotations, tr)})
    save_coco(train_save_path, coco)

    # Test Data
    coco.update({"images": ts, "annotations": filter_annotations(annotations, ts)})
    save_coco(test_save_path, coco)

    print(
        "Saved {} entries in {} and {} in {}".format(
            len(tr), train_save_path, len(ts), test_save_path
        )
    )


if __name__ == "__main__":
    # Script entry point: read the command-line options and forward each one
    # to main() by keyword.
    args = parser.parse_args()

    main(
        annotation_path=args.annotation_path,
        split_ratio=args.split_ratio,
        having_annotations=args.having_annotations,
        train_save_path=args.train,
        test_save_path=args.test,
        random_state=24,  # fixed seed for the split
    )