Spaces:

mckabue
/

document-similarity-matching-using-visual-layout-features-archive

Build error

App Files Files Community

loliipopshock committed on Jul 7, 2020

Commit

ea5f6fe

·

1 Parent(s): 0701c1d

Add the cocosplit script

Files changed (3) hide show

README.md +5 -0
utils/__init__.py +0 -0
utils/cocosplit.py +65 -0

README.md CHANGED Viewed

	@@ -1 +1,6 @@
1	# Scripts for training Layout Detection Models using Detectron2

 # Scripts for training Layout Detection Models using Detectron2
+## Reference
+- **[cocosplit](https://github.com/akarazniewicz/cocosplit)**: A script that splits a COCO annotations file into training and test sets.

utils/__init__.py ADDED Viewed

File without changes

utils/cocosplit.py ADDED Viewed

	@@ -0,0 +1,65 @@

+# Modified based on https://github.com/akarazniewicz/cocosplit/blob/master/cocosplit.py
+import json
+import argparse
+import funcy
+from sklearn.model_selection import train_test_split
+parser = argparse.ArgumentParser(description='Splits COCO annotations file into training and test sets.')
+parser.add_argument('annotations', metavar='coco_annotations', type=str,
+                    help='Path to COCO annotations file.')
+parser.add_argument('train', type=str, help='Where to store COCO training annotations')
+parser.add_argument('test', type=str, help='Where to store COCO test annotations')
+parser.add_argument('-s', dest='split_ratio', type=float, required=True,
+                    help="A percentage of a split; a number in (0, 1)")
+parser.add_argument('--having-annotations', dest='having_annotations', action='store_true',
+                    help='Ignore all images without annotations. Keep only these with at least one annotation')
+def save_coco(file, info, licenses, images, annotations, categories):
+    with open(file, 'wt', encoding='UTF-8') as coco:
+        json.dump({ 'info': info, 'licenses': licenses, 'images': images,
+            'annotations': annotations, 'categories': categories}, coco, indent=2, sort_keys=True)
+def filter_annotations(annotations, images):
+    image_ids = funcy.lmap(lambda i: int(i['id']), images)
+    return funcy.lfilter(lambda a: int(a['image_id']) in image_ids, annotations)
+def main(annotation_path,
+         split_ratio,
+         having_annotations,
+         train_save_path,
+         test_save_path,
+         random_state=None):
+    with open(annotation_path, 'rt', encoding='UTF-8') as annotations:
+        coco = json.load(annotations)
+        info = coco['info']
+        licenses = coco['licenses']
+        images = coco['images']
+        annotations = coco['annotations']
+        categories = coco['categories']
+        number_of_images = len(images)
+        images_with_annotations = funcy.lmap(lambda a: int(a['image_id']), annotations)
+        if having_annotations:
+            images = funcy.lremove(lambda i: i['id'] not in images_with_annotations, images)
+        x, y = train_test_split(images, train_size=split_ratio, random_state=random_state)
+        save_coco(train_save_path, info, licenses, x, filter_annotations(annotations, x), categories)
+        save_coco(test_save_path, info, licenses, y, filter_annotations(annotations, y), categories)
+        print("Saved {} entries in {} and {} in {}".format(len(x), train_save_path, len(y), test_save_path))
+if __name__ == "__main__":
+    args = parser.parse_args()
+    main(args.annotation_path,
+         args.split_ratio,
+         args.having_annotations,
+         args.train,
+         args.test,
+         random_state=24)