Spaces:

mckabue
/

document-similarity-matching-using-visual-layout-features-archive

Build error

Shannon Shen commited on Feb 14, 2022

Commit

95245f5

Merge pull request #7 from XBeg9/fix/value-error

fix: ValueError: With n_samples=0, test_size=None and train_size=0.8, the resulting train set will be empty. Adjust any of the aforementioned parameters.

Files changed (1) hide show

utils/cocosplit.py +71 -44

utils/cocosplit.py CHANGED Viewed

@@ -5,48 +5,74 @@ import argparse
 import funcy
 from sklearn.model_selection import train_test_split
-parser = argparse.ArgumentParser(description='Splits COCO annotations file into training and test sets.')
-parser.add_argument('--annotation_path', metavar='coco_annotations', type=str,
-                    help='Path to COCO annotations file.')
-parser.add_argument('--train', type=str, help='Where to store COCO training annotations')
-parser.add_argument('--test', type=str, help='Where to store COCO test annotations')
-parser.add_argument('--split-ratio', dest='split_ratio', type=float, required=True,
-                    help="A percentage of a split; a number in (0, 1)")
-parser.add_argument('--having-annotations', dest='having_annotations', action='store_true',
-                    help='Ignore all images without annotations. Keep only these with at least one annotation')
 def save_coco(file, tagged_data):
-    with open(file, 'wt', encoding='UTF-8') as coco:
         json.dump(tagged_data, coco, indent=2, sort_keys=True)
 def filter_annotations(annotations, images):
-    image_ids = funcy.lmap(lambda i: int(i['id']), images)
-    return funcy.lfilter(lambda a: int(a['image_id']) in image_ids, annotations)
-def main(annotation_path,
-         split_ratio,
-         having_annotations,
-         train_save_path,
-         test_save_path,
-         random_state=None):
-    with open(annotation_path, 'rt', encoding='UTF-8') as annotations:
         coco = json.load(annotations)
-    images = coco['images']
-    annotations = coco['annotations']
-    ids_with_annotations = funcy.lmap(lambda a: int(a['image_id']), annotations)
     # Images with annotations
-    img_ann = funcy.lremove(lambda i: i['id'] not in ids_with_annotations, images)
-    tr_ann, ts_ann = train_test_split(img_ann, train_size=split_ratio,
-                                      random_state=random_state)
-    # Images without annotations
-    img_wo_ann = funcy.lremove(lambda i: i['id'] in ids_with_annotations, images)
-    tr_wo_ann, ts_wo_ann = train_test_split(img_wo_ann, train_size=split_ratio,
-                                            random_state=random_state)
     if having_annotations:
         tr, ts = tr_ann, ts_ann
@@ -59,27 +85,28 @@ def main(annotation_path,
         tr, ts = tr_ann, ts_ann
     # Train Data
-    coco.update({'images': tr,
-                 'annotations': filter_annotations(annotations, tr)})
     save_coco(train_save_path, coco)
     # Test Data
-    coco.update({'images': ts,
-                 'annotations': filter_annotations(annotations, ts)})
     save_coco(test_save_path, coco)
-    print("Saved {} entries in {} and {} in {}".format(len(tr),
-                                                       train_save_path,
-                                                       len(ts),
-                                                       test_save_path))
 if __name__ == "__main__":
     args = parser.parse_args()
-    main(args.annotation_path,
-         args.split_ratio,
-         args.having_annotations,
-         args.train,
-         args.test,
-         random_state=24)

 import funcy
 from sklearn.model_selection import train_test_split
+parser = argparse.ArgumentParser(
+    description="Splits COCO annotations file into training and test sets."
+)
+parser.add_argument(
+    "--annotation-path",
+    metavar="coco_annotations",
+    type=str,
+    help="Path to COCO annotations file.",
+)
+parser.add_argument(
+    "--train", type=str, help="Where to store COCO training annotations"
+)
+parser.add_argument("--test", type=str, help="Where to store COCO test annotations")
+parser.add_argument(
+    "--split-ratio",
+    dest="split_ratio",
+    type=float,
+    required=True,
+    help="A percentage of a split; a number in (0, 1)",
+)
+parser.add_argument(
+    "--having-annotations",
+    dest="having_annotations",
+    action="store_true",
+    help="Ignore all images without annotations. Keep only these with at least one annotation",
+)
 def save_coco(file, tagged_data):
+    with open(file, "wt", encoding="UTF-8") as coco:
         json.dump(tagged_data, coco, indent=2, sort_keys=True)
 def filter_annotations(annotations, images):
+    image_ids = funcy.lmap(lambda i: int(i["id"]), images)
+    return funcy.lfilter(lambda a: int(a["image_id"]) in image_ids, annotations)
+def main(
+    annotation_path,
+    split_ratio,
+    having_annotations,
+    train_save_path,
+    test_save_path,
+    random_state=None,
+):
+    with open(annotation_path, "rt", encoding="UTF-8") as annotations:
         coco = json.load(annotations)
+    images = coco["images"]
+    annotations = coco["annotations"]
+    ids_with_annotations = funcy.lmap(lambda a: int(a["image_id"]), annotations)
     # Images with annotations
+    img_ann = funcy.lremove(lambda i: i["id"] not in ids_with_annotations, images)
+    tr_ann, ts_ann = train_test_split(
+        img_ann, train_size=split_ratio, random_state=random_state
+    )
+    img_wo_ann = funcy.lremove(lambda i: i["id"] in ids_with_annotations, images)
+    if len(img_wo_ann) > 0:
+        tr_wo_ann, ts_wo_ann = train_test_split(
+            img_wo_ann, train_size=split_ratio, random_state=random_state
+        )
+    else:
+        tr_wo_ann, ts_wo_ann = [], []  # Images without annotations
     if having_annotations:
         tr, ts = tr_ann, ts_ann
         tr, ts = tr_ann, ts_ann
     # Train Data
+    coco.update({"images": tr, "annotations": filter_annotations(annotations, tr)})
     save_coco(train_save_path, coco)
     # Test Data
+    coco.update({"images": ts, "annotations": filter_annotations(annotations, ts)})
     save_coco(test_save_path, coco)
+    print(
+        "Saved {} entries in {} and {} in {}".format(
+            len(tr), train_save_path, len(ts), test_save_path
+        )
+    )
 if __name__ == "__main__":
     args = parser.parse_args()
+    main(
+        args.annotation_path,
+        args.split_ratio,
+        args.having_annotations,
+        args.train,
+        args.test,
+        random_state=24,
+    )