joonhyun23452 committed
Commit 8075387 · 1 Parent(s): 5f752ee

open proxydet demo

This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
Files changed (50)
  1. app.py +56 -39
  2. assets/beach.jpg +0 -0
  3. assets/desk.jpg +0 -0
  4. assets/pikachu.jpg +0 -0
  5. configs/Base-C2_L_R5021k_640b64_4x.yaml +83 -0
  6. configs/BoxSup-C2_Lbase_CLIP_R5021k_640b64_4x.yaml +7 -0
  7. configs/ProxyDet_R50_Lbase_INL.yaml +59 -0
  8. configs/ProxyDet_SwinB_Lbase_INL.yaml +51 -0
  9. datasets/metadata/__init__.py +0 -0
  10. demo.py +245 -0
  11. packages.txt +3 -0
  12. proxydet/__init__.py +18 -0
  13. proxydet/cat_names.py +1 -0
  14. proxydet/config.py +156 -0
  15. proxydet/custom_solver.py +78 -0
  16. proxydet/data/custom_build_augmentation.py +51 -0
  17. proxydet/data/custom_dataset_dataloader.py +331 -0
  18. proxydet/data/custom_dataset_mapper.py +280 -0
  19. proxydet/data/datasets/cc.py +23 -0
  20. proxydet/data/datasets/coco_zeroshot.py +121 -0
  21. proxydet/data/datasets/imagenet.py +41 -0
  22. proxydet/data/datasets/lvis_22k_categories.py +0 -0
  23. proxydet/data/datasets/lvis_v1.py +155 -0
  24. proxydet/data/datasets/objects365.py +770 -0
  25. proxydet/data/datasets/oid.py +535 -0
  26. proxydet/data/datasets/register_oid.py +122 -0
  27. proxydet/data/tar_dataset.py +138 -0
  28. proxydet/data/transforms/custom_augmentation_impl.py +60 -0
  29. proxydet/data/transforms/custom_transform.py +114 -0
  30. proxydet/evaluation/custom_coco_eval.py +124 -0
  31. proxydet/evaluation/oideval.py +699 -0
  32. proxydet/modeling/backbone/swintransformer.py +750 -0
  33. proxydet/modeling/backbone/timm.py +221 -0
  34. proxydet/modeling/debug.py +334 -0
  35. proxydet/modeling/meta_arch/custom_rcnn.py +232 -0
  36. proxydet/modeling/meta_arch/d2_deformable_detr.py +308 -0
  37. proxydet/modeling/roi_heads/proxydet_fast_rcnn.py +618 -0
  38. proxydet/modeling/roi_heads/proxydet_roi_heads.py +556 -0
  39. proxydet/modeling/roi_heads/zero_shot_classifier.py +111 -0
  40. proxydet/modeling/text/text_encoder.py +189 -0
  41. proxydet/modeling/utils.py +54 -0
  42. proxydet/predictor.py +295 -0
  43. requirements.txt +0 -6
  44. third_party/CenterNet2/.github/CODE_OF_CONDUCT.md +5 -0
  45. third_party/CenterNet2/.github/CONTRIBUTING.md +68 -0
  46. third_party/CenterNet2/.github/Detectron2-Logo-Horz.svg +1 -0
  47. third_party/CenterNet2/.github/ISSUE_TEMPLATE.md +5 -0
  48. third_party/CenterNet2/.github/ISSUE_TEMPLATE/bugs.md +38 -0
  49. third_party/CenterNet2/.github/ISSUE_TEMPLATE/config.yml +17 -0
  50. third_party/CenterNet2/.github/ISSUE_TEMPLATE/documentation.md +14 -0
app.py CHANGED
@@ -3,10 +3,14 @@ import cv2
 import os
 import gradio as gr
 import numpy as np
-from transformers import OwlViTProcessor, OwlViTForObjectDetection
-
-def setup():
+from argparse import Namespace
+try:
+    import detectron2
+except:
     os.system("python3 -m pip install 'git+https://github.com/facebookresearch/detectron2.git'")
+    import detectron2
+from demo import setup_cfg
+from proxydet.predictor import VisualizationDemo
 
 # Use GPU if available
 if torch.cuda.is_available():
@@ -14,26 +18,42 @@ if torch.cuda.is_available():
 else:
     device = torch.device("cpu")
 
-model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32").to(device)
-model.eval()
-processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
-
-
-def query_image(img, text_queries, score_threshold):
-    text_queries = text_queries
-    text_queries = text_queries.split(",")
-
-    target_sizes = torch.Tensor([img.shape[:2]])
-    inputs = processor(text=text_queries, images=img, return_tensors="pt").to(device)
-
+# download metadata
+zs_weight_path = 'datasets/metadata/lvis_v1_clip_a+cname.npy'
+if not os.path.exists(zs_weight_path):
+    wget.download("https://github.com/facebookresearch/Detic/raw/main/datasets/metadata/lvis_v1_clip_a+cname.npy", out=zs_weight_path)
+
+args = Namespace(
+    base_cat_threshold=0.9,
+    confidence_threshold=0.0,
+    config_file='configs/ProxyDet_SwinB_Lbase_INL.yaml',
+    cpu=not torch.cuda.is_available(),
+    custom_vocabulary='headphone,webcam,paper,coffe',
+    input=['.assets/desk.jpg'],
+    opts=['MODEL.WEIGHTS', 'models/proxydet_swinb_w_inl.pth'],
+    output='out.jpg',
+    pred_all_class=False,
+    video_input=None,
+    vocabulary='custom',
+    webcam=None,
+    zeroshot_weight_path='datasets/metadata/lvis_v1_clip_a+cname.npy'
+)
+cfg = setup_cfg(args)
+ovd_demo = VisualizationDemo(cfg, args)
+
+def query_image(img, text_queries, score_threshold, base_alpha, novel_beta):
+    text_queries_split = text_queries.split(",")
+    ovd_demo.reset_classifier(text_queries)
+    ovd_demo.reset_base_cat_mask()
+    ovd_demo.predictor.model.roi_heads.cmm_base_alpha = base_alpha
+    ovd_demo.predictor.model.roi_heads.cmm_novel_beta = novel_beta
+    img_bgr = img[:, :, ::-1]
     with torch.no_grad():
-        outputs = model(**inputs)
-
-    outputs.logits = outputs.logits.cpu()
-    outputs.pred_boxes = outputs.pred_boxes.cpu()
-    results = processor.post_process(outputs=outputs, target_sizes=target_sizes)
-    boxes, scores, labels = results[0]["boxes"], results[0]["scores"], results[0]["labels"]
-
+        predictions, visualized_output = ovd_demo.run_on_image(img_bgr)
+    output_instances = predictions["instances"].to(device)
+    boxes = output_instances.pred_boxes.tensor
+    scores = output_instances.scores
+    labels = output_instances.pred_classes.tolist()
+
     font = cv2.FONT_HERSHEY_SIMPLEX
 
     for box, score, label in zip(boxes, scores, labels):
@@ -47,35 +67,32 @@ def query_image(img, text_queries, score_threshold):
         y = box[3] + 25
 
         img = cv2.putText(
-            img, text_queries[label], (box[0], y), font, 1, (255,0,0), 2, cv2.LINE_AA
+            img, text_queries_split[label], (box[0], y), font, 1, (255,0,0), 2, cv2.LINE_AA
         )
     return img
 
 if __name__ == "__main__":
-    setup()
-
     description = """
-    Gradio demo for <a href="https://huggingface.co/docs/transformers/main/en/model_doc/owlvit">OWL-ViT</a>,
-    introduced in <a href="https://arxiv.org/abs/2205.06230">Simple Open-Vocabulary Object Detection
-    with Vision Transformers</a>.
-    \n\nYou can use OWL-ViT to query images with text descriptions of any object.
-    To use it, simply upload an image and enter comma separated text descriptions of objects you want to query the image for. You
-    can also use the score threshold slider to set a threshold to filter out low probability predictions.
-    \n\nOWL-ViT is trained on text templates,
-    hence you can get better predictions by querying the image with text templates used in training the original model: *"photo of a star-spangled banner"*,
-    *"image of a shoe"*. Refer to the <a href="https://arxiv.org/abs/2103.00020">CLIP</a> paper to see the full list of text templates used to augment the training data.
-    \n\n<a href="https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/zeroshot_object_detection_with_owlvit.ipynb">Colab demo</a>
+    Gradio demo for ProxyDet, introduced in <a href="https://arxiv.org/abs/2312.07266">ProxyDet: Synthesizing Proxy Novel Classes via Classwise Mixup for Open-Vocabulary Object Detection</a>.
+    \n\nYou can use ProxyDet to query images with text descriptions of any object.
+    Simply upload an image and enter comma separated objects (e.g., "dog,cat,headphone") which you want to detect within the image.
+    You can also use the score threshold slider to set a threshold to filter out low probability predictions.
     """
     demo = gr.Interface(
         query_image,
-        inputs=[gr.Image(), "text", gr.Slider(0, 1, value=0.1)],
+        inputs=[gr.Image(), "text", gr.Slider(0, 1, value=0.1), gr.Slider(0, 1, value=0.15), gr.Slider(0, 1, value=0.35)],
         outputs="image",
-        title="Zero-Shot Object Detection with OWL-ViT",
+        title="Open-Vocabulary Object Detection with ProxyDet",
         description=description,
         examples=[
-            ["assets/astronaut.png", "human face, rocket, star-spangled banner, nasa badge", 0.11],
-            ["assets/coffee.png", "coffee mug, spoon, plate", 0.1],
-            ["assets/butterflies.jpeg", "orange butterfly", 0.3],
+            ["assets/desk.jpg", "headphone,webcam,paper,coffee", 0.11, 0.15, 0.35],
+            ["assets/beach.jpg", "person,kite", 0.1, 0.15, 0.35],
+            ["assets/pikachu.jpg", "pikachu,person", 0.15, 0.15, 0.35],
         ],
     )
-    demo.launch()
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=int(os.environ["NSML_PORT1"]),
+        share=False,
+        debug=True,
+    )
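For reference, the new query_image() can also be exercised outside Gradio. The snippet below is an illustrative sketch and not part of the commit; it assumes app.py's module-level setup (which builds ovd_demo and defines query_image) runs on import, and it reuses the example image and slider values from the Interface definition above.

    import cv2
    from app import query_image  # importing app runs the module-level model setup

    img_bgr = cv2.imread("assets/desk.jpg")            # OpenCV loads BGR
    img_rgb = img_bgr[:, :, ::-1]                      # query_image expects RGB, like gr.Image
    out_rgb = query_image(img_rgb, "headphone,webcam,paper,coffee",
                          score_threshold=0.11, base_alpha=0.15, novel_beta=0.35)
    cv2.imwrite("out.jpg", out_rgb[:, :, ::-1])        # back to BGR for imwrite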
assets/beach.jpg ADDED
assets/desk.jpg ADDED
assets/pikachu.jpg ADDED
configs/Base-C2_L_R5021k_640b64_4x.yaml ADDED
@@ -0,0 +1,83 @@
# Code Copied from https://github.com/facebookresearch/Detic/blob/main/configs/Base-C2_L_R5021k_640b64_4x.yaml
MODEL:
  META_ARCHITECTURE: "CustomRCNN"
  MASK_ON: True
  PROPOSAL_GENERATOR:
    NAME: "CenterNet"
  WEIGHTS: "models/resnet50_miil_21k.pkl"
  BACKBONE:
    NAME: build_p67_timm_fpn_backbone
  TIMM:
    BASE_NAME: resnet50_in21k
  FPN:
    IN_FEATURES: ["layer3", "layer4", "layer5"]
  PIXEL_MEAN: [123.675, 116.280, 103.530]
  PIXEL_STD: [58.395, 57.12, 57.375]
  ROI_HEADS:
    NAME: ProxydetCascadeROIHeads
    IN_FEATURES: ["p3", "p4", "p5"]
    IOU_THRESHOLDS: [0.6]
    NUM_CLASSES: 1203
    SCORE_THRESH_TEST: 0.02
    NMS_THRESH_TEST: 0.5
  ROI_BOX_CASCADE_HEAD:
    IOUS: [0.6, 0.7, 0.8]
  ROI_BOX_HEAD:
    NAME: "FastRCNNConvFCHead"
    NUM_FC: 2
    POOLER_RESOLUTION: 7
    CLS_AGNOSTIC_BBOX_REG: True
    MULT_PROPOSAL_SCORE: True

    USE_SIGMOID_CE: True
    USE_FED_LOSS: True
  ROI_MASK_HEAD:
    NAME: "MaskRCNNConvUpsampleHead"
    NUM_CONV: 4
    POOLER_RESOLUTION: 14
    CLS_AGNOSTIC_MASK: True
  CENTERNET:
    NUM_CLASSES: 1203
    REG_WEIGHT: 1.
    NOT_NORM_REG: True
    ONLY_PROPOSAL: True
    WITH_AGN_HM: True
    INFERENCE_TH: 0.0001
    PRE_NMS_TOPK_TRAIN: 4000
    POST_NMS_TOPK_TRAIN: 2000
    PRE_NMS_TOPK_TEST: 1000
    POST_NMS_TOPK_TEST: 256
    NMS_TH_TRAIN: 0.9
    NMS_TH_TEST: 0.9
    POS_WEIGHT: 0.5
    NEG_WEIGHT: 0.5
    IGNORE_HIGH_FP: 0.85
DATASETS:
  TRAIN: ("lvis_v1_train",)
  TEST: ("lvis_v1_val",)
DATALOADER:
  SAMPLER_TRAIN: "RepeatFactorTrainingSampler"
  REPEAT_THRESHOLD: 0.001
  NUM_WORKERS: 8
TEST:
  DETECTIONS_PER_IMAGE: 300
SOLVER:
  LR_SCHEDULER_NAME: "WarmupCosineLR"
  CHECKPOINT_PERIOD: 1000000000
  WARMUP_ITERS: 10000
  WARMUP_FACTOR: 0.0001
  USE_CUSTOM_SOLVER: True
  OPTIMIZER: "ADAMW"
  MAX_ITER: 90000
  IMS_PER_BATCH: 64
  BASE_LR: 0.0002
  CLIP_GRADIENTS:
    ENABLED: True
INPUT:
  FORMAT: RGB
  CUSTOM_AUG: EfficientDetResizeCrop
  TRAIN_SIZE: 640
OUTPUT_DIR: "./output/ProxyDet/auto"
EVAL_PROPOSAL_AR: False
VERSION: 2
FP16: True
configs/BoxSup-C2_Lbase_CLIP_R5021k_640b64_4x.yaml ADDED
@@ -0,0 +1,7 @@
# Code Copied from https://github.com/facebookresearch/Detic/blob/main/configs/BoxSup-C2_Lbase_CLIP_R5021k_640b64_4x.yaml
_BASE_: "Base-C2_L_R5021k_640b64_4x.yaml"
MODEL:
  ROI_BOX_HEAD:
    USE_ZEROSHOT_CLS: True
DATASETS:
  TRAIN: ("lvis_v1_train_norare",)
configs/ProxyDet_R50_Lbase_INL.yaml ADDED
@@ -0,0 +1,59 @@
# Code Adapted from https://github.com/facebookresearch/Detic/blob/main/configs/Detic_LbaseI_CLIP_R5021k_640b64_4x_ft4x_max-size.yaml
_BASE_: "Base-C2_L_R5021k_640b64_4x.yaml"
MODEL:
  ROI_BOX_HEAD:
    USE_ZEROSHOT_CLS: True
    IMAGE_LABEL_LOSS: 'max_size'
    USE_REGIONAL_EMBEDDING: True
  ROI_HEADS:
    BASE_CAT_MASK: "datasets/metadata/lvis_v1_base_cat_mask.npy"
    CMM:
      MIXUP_STAGE: [2]
      MIXUP_STAGE_TEST: [2]
      MIXUP_BETA: 1.0
      LOSS: "l1"
      LOSS_WEIGHT: 256.0
      SEPARATED_BRANCH: True
      BASE_ALPHA: 0.15
      NOVEL_BETA: 0.35
      USE_INL: False
      PROTOTYPE: "obj_score"
      PROTOTYPE_TEMP: 1.0
      CLASSIFIER_TEMP: 1.0
      USE_SIGMOID_CE: True
  WEIGHTS: "models/BoxSup-C2_Lbase_CLIP_R5021k_640b64_4x.pth"
SOLVER:
  MAX_ITER: 90000
  IMS_PER_BATCH: 64
  BASE_LR: 0.0002
  WARMUP_ITERS: 1000
  WARMUP_FACTOR: 0.001
DATASETS:
  TRAIN: ("lvis_v1_train_norare","imagenet_lvis_v1")
DATALOADER:
  SAMPLER_TRAIN: "MultiDatasetSampler"
  DATASET_RATIO: [1, 4]
  USE_DIFF_BS_SIZE: True
  DATASET_BS: [8, 32]
  DATASET_INPUT_SIZE: [640, 320]
  USE_RFS: [True, False]
  DATASET_INPUT_SCALE: [[0.1, 2.0], [0.5, 1.5]]
  FILTER_EMPTY_ANNOTATIONS: False
  MULTI_DATASET_GROUPING: True
  DATASET_ANN: ['box', 'image']
  NUM_WORKERS: 8
WITH_IMAGE_LABELS: True
configs/ProxyDet_SwinB_Lbase_INL.yaml ADDED
@@ -0,0 +1,51 @@
# Code Adapted from https://github.com/facebookresearch/Detic/blob/main/configs/Detic_LbaseI_CLIP_SwinB_896b32_4x_ft4x_max-size.yaml
_BASE_: "Base-C2_L_R5021k_640b64_4x.yaml"
MODEL:
  ROI_BOX_HEAD:
    USE_ZEROSHOT_CLS: True
    IMAGE_LABEL_LOSS: 'max_size'
    USE_REGIONAL_EMBEDDING: True
  ROI_HEADS:
    BASE_CAT_MASK: "datasets/metadata/lvis_v1_base_cat_mask.npy"
    CMM:
      MIXUP_STAGE: [2]
      MIXUP_STAGE_TEST: [2]
      MIXUP_BETA: 1.0
      LOSS: "l1"
      LOSS_WEIGHT: 256.0
      SEPARATED_BRANCH: True
      BASE_ALPHA: 0.15
      NOVEL_BETA: 0.35
      USE_INL: False
      PROTOTYPE: "obj_score"
      PROTOTYPE_TEMP: 1.0
      CLASSIFIER_TEMP: 1.0
      USE_SIGMOID_CE: True
  BACKBONE:
    NAME: build_swintransformer_fpn_backbone
  SWIN:
    SIZE: B-22k
  FPN:
    IN_FEATURES: ["swin1", "swin2", "swin3"]
  WEIGHTS: "models/BoxSup-C2_Lbase_CLIP_SwinB_896b32_4x.pth"
SOLVER:
  MAX_ITER: 180000
  IMS_PER_BATCH: 32
  BASE_LR: 0.0001
  WARMUP_ITERS: 1000
  WARMUP_FACTOR: 0.001
DATASETS:
  TRAIN: ("lvis_v1_train_norare","imagenet_lvis_v1")
DATALOADER:
  SAMPLER_TRAIN: "MultiDatasetSampler"
  DATASET_RATIO: [1, 4]
  USE_DIFF_BS_SIZE: True
  DATASET_BS: [4, 16]
  DATASET_INPUT_SIZE: [896, 448]
  USE_RFS: [True, False]
  DATASET_INPUT_SCALE: [[0.1, 2.0], [0.5, 1.5]]
  FILTER_EMPTY_ANNOTATIONS: False
  MULTI_DATASET_GROUPING: True
  DATASET_ANN: ['box', 'image']
  NUM_WORKERS: 8
WITH_IMAGE_LABELS: True
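As a usage note (not part of the commit): this config is consumed through setup_cfg() in demo.py below, which layers the CenterNet2 and ProxyDet keys onto detectron2's defaults before merging the YAML. A minimal sketch:

    import sys
    sys.path.insert(0, 'third_party/CenterNet2/')
    from detectron2.config import get_cfg
    from centernet.config import add_centernet_config
    from proxydet.config import add_proxydet_config

    cfg = get_cfg()
    add_centernet_config(cfg)   # CenterNet2 proposal-generator keys
    add_proxydet_config(cfg)    # ProxyDet keys (ROI_HEADS.CMM, SWIN, TIMM, ...)
    cfg.merge_from_file('configs/ProxyDet_SwinB_Lbase_INL.yaml')
    cfg.merge_from_list(['MODEL.WEIGHTS', 'models/proxydet_swinb_w_inl.pth'])  # weight path as used in app.py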
datasets/metadata/__init__.py ADDED
File without changes
demo.py ADDED
@@ -0,0 +1,245 @@
# Copyright (c) Facebook, Inc. and its affiliates.
'''
Modifications Copyright (c) 2024-present NAVER Corp, Apache License v2.0
original source: https://github.com/facebookresearch/Detic/blob/main/demo.py
'''
import argparse
import glob
import multiprocessing as mp
import numpy as np
import os
import tempfile
import time
import warnings
import cv2
import tqdm
import sys
import mss

from detectron2.config import get_cfg
from detectron2.data.detection_utils import read_image
from detectron2.utils.logger import setup_logger
from detectron2.engine.defaults import _highlight

sys.path.insert(0, 'third_party/CenterNet2/')
from centernet.config import add_centernet_config
from proxydet.config import add_proxydet_config

from proxydet.predictor import VisualizationDemo

# Fake a video capture object OpenCV style - half width, half height of first screen using MSS
class ScreenGrab:
    def __init__(self):
        self.sct = mss.mss()
        m0 = self.sct.monitors[0]
        self.monitor = {'top': 0, 'left': 0, 'width': m0['width'] / 2, 'height': m0['height'] / 2}

    def read(self):
        img = np.array(self.sct.grab(self.monitor))
        nf = cv2.cvtColor(img, cv2.COLOR_BGRA2BGR)
        return (True, nf)

    def isOpened(self):
        return True
    def release(self):
        return True


# constants
WINDOW_NAME = "ProxyDet"

def setup_cfg(args):
    cfg = get_cfg()
    if args.cpu:
        cfg.MODEL.DEVICE = "cpu"
    add_centernet_config(cfg)
    add_proxydet_config(cfg)
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    # Set score_threshold for builtin models
    cfg.MODEL.RETINANET.SCORE_THRESH_TEST = args.confidence_threshold
    cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = args.confidence_threshold
    cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = args.confidence_threshold
    cfg.MODEL.ROI_BOX_HEAD.ZEROSHOT_WEIGHT_PATH = 'rand'  # load later
    if not args.pred_all_class:
        cfg.MODEL.ROI_HEADS.ONE_CLASS_PER_PROPOSAL = True
    cfg.freeze()
    return cfg


def get_parser():
    parser = argparse.ArgumentParser(description="Detectron2 demo for builtin configs")
    parser.add_argument(
        "--config-file",
        default="configs/quick_schedules/mask_rcnn_R_50_FPN_inference_acc_test.yaml",
        metavar="FILE",
        help="path to config file",
    )
    parser.add_argument("--webcam", help="Take inputs from webcam.")
    parser.add_argument("--cpu", action='store_true', help="Use CPU only.")
    parser.add_argument("--video-input", help="Path to video file.")
    parser.add_argument(
        "--input",
        nargs="+",
        help="A list of space separated input images; "
        "or a single glob pattern such as 'directory/*.jpg'",
    )
    parser.add_argument(
        "--output",
        help="A file or directory to save output visualizations. "
        "If not given, will show output in an OpenCV window.",
    )
    parser.add_argument(
        "--vocabulary",
        default="lvis",
        choices=['lvis', 'openimages', 'objects365', 'coco', 'custom'],
        help="",
    )
    parser.add_argument(
        "--custom_vocabulary",
        default="",
        help="",
    )
    parser.add_argument(
        "--zeroshot_weight_path",
        default=None,
        help="zeroshot text embedding path used during training",
    )
    parser.add_argument("--pred_all_class", action='store_true')
    parser.add_argument(
        "--confidence-threshold",
        type=float,
        default=0.5,
        help="Minimum score for instance predictions to be shown",
    )
    parser.add_argument(
        "--base-cat-threshold",
        type=float,
        default=0.9,
        help="Minimum score for similarity with trained base categories",
    )
    parser.add_argument(
        "--opts",
        help="Modify config options using the command-line 'KEY VALUE' pairs",
        default=[],
        nargs=argparse.REMAINDER,
    )
    return parser


def test_opencv_video_format(codec, file_ext):
    with tempfile.TemporaryDirectory(prefix="video_format_test") as dir:
        filename = os.path.join(dir, "test_file" + file_ext)
        writer = cv2.VideoWriter(
            filename=filename,
            fourcc=cv2.VideoWriter_fourcc(*codec),
            fps=float(30),
            frameSize=(10, 10),
            isColor=True,
        )
        [writer.write(np.zeros((10, 10, 3), np.uint8)) for _ in range(30)]
        writer.release()
        if os.path.isfile(filename):
            return True
        return False


if __name__ == "__main__":
    mp.set_start_method("spawn", force=True)
    args = get_parser().parse_args()
    setup_logger(name="fvcore")
    logger = setup_logger()
    logger.info("Arguments: " + str(args))

    cfg = setup_cfg(args)
    print(_highlight(cfg.dump(), ".yaml"))

    demo = VisualizationDemo(cfg, args)

    if args.input:
        if len(args.input) == 1:
            args.input = glob.glob(os.path.expanduser(args.input[0]))
            assert args.input, "The input path(s) was not found"
        for path in tqdm.tqdm(args.input, disable=not args.output):
            img = read_image(path, format="BGR")
            start_time = time.time()
            predictions, visualized_output = demo.run_on_image(img)
            logger.info(
                "{}: {} in {:.2f}s".format(
                    path,
                    "detected {} instances".format(len(predictions["instances"]))
                    if "instances" in predictions
                    else "finished",
                    time.time() - start_time,
                )
            )

            if args.output:
                if os.path.isdir(args.output):
                    assert os.path.isdir(args.output), args.output
                    out_filename = os.path.join(args.output, os.path.basename(path))
                else:
                    assert len(args.input) == 1, "Please specify a directory with args.output"
                    out_filename = args.output
                visualized_output.save(out_filename)
            else:
                cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
                cv2.imshow(WINDOW_NAME, visualized_output.get_image()[:, :, ::-1])
                if cv2.waitKey(0) == 27:
                    break  # esc to quit
    elif args.webcam:
        assert args.input is None, "Cannot have both --input and --webcam!"
        assert args.output is None, "output not yet supported with --webcam!"
        if args.webcam == "screen":
            cam = ScreenGrab()
        else:
            cam = cv2.VideoCapture(int(args.webcam))
        for vis in tqdm.tqdm(demo.run_on_video(cam)):
            cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
            cv2.imshow(WINDOW_NAME, vis)
            if cv2.waitKey(1) == 27:
                break  # esc to quit
        cam.release()
        cv2.destroyAllWindows()
    elif args.video_input:
        video = cv2.VideoCapture(args.video_input)
        width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
        frames_per_second = video.get(cv2.CAP_PROP_FPS)
        num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
        basename = os.path.basename(args.video_input)
        codec, file_ext = (
            ("x264", ".mkv") if test_opencv_video_format("x264", ".mkv") else ("mp4v", ".mp4")
        )
        if codec == ".mp4v":
            warnings.warn("x264 codec not available, switching to mp4v")
        if args.output:
            if os.path.isdir(args.output):
                output_fname = os.path.join(args.output, basename)
                output_fname = os.path.splitext(output_fname)[0] + file_ext
            else:
                output_fname = args.output
            assert not os.path.isfile(output_fname), output_fname
            output_file = cv2.VideoWriter(
                filename=output_fname,
                # some installation of opencv may not support x264 (due to its license),
                # you can try other format (e.g. MPEG)
                fourcc=cv2.VideoWriter_fourcc(*codec),
                fps=float(frames_per_second),
                frameSize=(width, height),
                isColor=True,
            )
        assert os.path.isfile(args.video_input)
        for vis_frame in tqdm.tqdm(demo.run_on_video(video), total=num_frames):
            if args.output:
                output_file.write(vis_frame)
            else:
                cv2.namedWindow(basename, cv2.WINDOW_NORMAL)
                cv2.imshow(basename, vis_frame)
                if cv2.waitKey(1) == 27:
                    break  # esc to quit
        video.release()
        if args.output:
            output_file.release()
        else:
            cv2.destroyAllWindows()
packages.txt ADDED
@@ -0,0 +1,3 @@
ffmpeg
libsm6
libxext6
proxydet/__init__.py ADDED
@@ -0,0 +1,18 @@
# Copyright (c) Facebook, Inc. and its affiliates.
from .modeling.meta_arch import custom_rcnn
from .modeling.roi_heads import proxydet_roi_heads
from .modeling.backbone import swintransformer
from .modeling.backbone import timm


from .data.datasets import lvis_v1
from .data.datasets import imagenet
from .data.datasets import cc
from .data.datasets import objects365
from .data.datasets import oid
from .data.datasets import coco_zeroshot

try:
    from .modeling.meta_arch import d2_deformable_detr
except:
    pass
proxydet/cat_names.py ADDED
@@ -0,0 +1 @@
+ lvis_cat_names = ['aerosol_can', 'air_conditioner', 'airplane', 'alarm_clock', 'alcohol', 'alligator', 'almond', 'ambulance', 'amplifier', 'anklet', 'antenna', 'apple', 'applesauce', 'apricot', 'apron', 'aquarium', 'arctic_(type_of_shoe)', 'armband', 'armchair', 'armoire', 'armor', 'artichoke', 'trash_can', 'ashtray', 'asparagus', 'atomizer', 'avocado', 'award', 'awning', 'ax', 'baboon', 'baby_buggy', 'basketball_backboard', 'backpack', 'handbag', 'suitcase', 'bagel', 'bagpipe', 'baguet', 'bait', 'ball', 'ballet_skirt', 'balloon', 'bamboo', 'banana', 'Band_Aid', 'bandage', 'bandanna', 'banjo', 'banner', 'barbell', 'barge', 'barrel', 'barrette', 'barrow', 'baseball_base', 'baseball', 'baseball_bat', 'baseball_cap', 'baseball_glove', 'basket', 'basketball', 'bass_horn', 'bat_(animal)', 'bath_mat', 'bath_towel', 'bathrobe', 'bathtub', 'batter_(food)', 'battery', 'beachball', 'bead', 'bean_curd', 'beanbag', 'beanie', 'bear', 'bed', 'bedpan', 'bedspread', 'cow', 'beef_(food)', 'beeper', 'beer_bottle', 'beer_can', 'beetle', 'bell', 'bell_pepper', 'belt', 'belt_buckle', 'bench', 'beret', 'bib', 'Bible', 'bicycle', 'visor', 'billboard', 'binder', 'binoculars', 'bird', 'birdfeeder', 'birdbath', 'birdcage', 'birdhouse', 'birthday_cake', 'birthday_card', 'pirate_flag', 'black_sheep', 'blackberry', 'blackboard', 'blanket', 'blazer', 'blender', 'blimp', 'blinker', 'blouse', 'blueberry', 'gameboard', 'boat', 'bob', 'bobbin', 'bobby_pin', 'boiled_egg', 'bolo_tie', 'deadbolt', 'bolt', 'bonnet', 'book', 'bookcase', 'booklet', 'bookmark', 'boom_microphone', 'boot', 'bottle', 'bottle_opener', 'bouquet', 'bow_(weapon)', 'bow_(decorative_ribbons)', 'bow-tie', 'bowl', 'pipe_bowl', 'bowler_hat', 'bowling_ball', 'box', 'boxing_glove', 'suspenders', 'bracelet', 'brass_plaque', 'brassiere', 'bread-bin', 'bread', 'breechcloth', 'bridal_gown', 'briefcase', 'broccoli', 'broach', 'broom', 'brownie', 'brussels_sprouts', 'bubble_gum', 'bucket', 'horse_buggy', 'bull', 'bulldog', 'bulldozer', 'bullet_train', 'bulletin_board', 'bulletproof_vest', 'bullhorn', 'bun', 'bunk_bed', 'buoy', 'burrito', 'bus_(vehicle)', 'business_card', 'butter', 'butterfly', 'button', 'cab_(taxi)', 'cabana', 'cabin_car', 'cabinet', 'locker', 'cake', 'calculator', 'calendar', 'calf', 'camcorder', 'camel', 'camera', 'camera_lens', 'camper_(vehicle)', 'can', 'can_opener', 'candle', 'candle_holder', 'candy_bar', 'candy_cane', 'walking_cane', 'canister', 'canoe', 'cantaloup', 'canteen', 'cap_(headwear)', 'bottle_cap', 'cape', 'cappuccino', 'car_(automobile)', 'railcar_(part_of_a_train)', 'elevator_car', 'car_battery', 'identity_card', 'card', 'cardigan', 'cargo_ship', 'carnation', 'horse_carriage', 'carrot', 'tote_bag', 'cart', 'carton', 'cash_register', 'casserole', 'cassette', 'cast', 'cat', 'cauliflower', 'cayenne_(spice)', 'CD_player', 'celery', 'cellular_telephone', 'chain_mail', 'chair', 'chaise_longue', 'chalice', 'chandelier', 'chap', 'checkbook', 'checkerboard', 'cherry', 'chessboard', 'chicken_(animal)', 'chickpea', 'chili_(vegetable)', 'chime', 'chinaware', 'crisp_(potato_chip)', 'poker_chip', 'chocolate_bar', 'chocolate_cake', 'chocolate_milk', 'chocolate_mousse', 'choker', 'chopping_board', 'chopstick', 'Christmas_tree', 'slide', 'cider', 'cigar_box', 'cigarette', 'cigarette_case', 'cistern', 'clarinet', 'clasp', 'cleansing_agent', 'cleat_(for_securing_rope)', 'clementine', 'clip', 'clipboard', 'clippers_(for_plants)', 'cloak', 'clock', 'clock_tower', 'clothes_hamper', 'clothespin', 'clutch_bag', 'coaster', 'coat', 'coat_hanger', 
'coatrack', 'cock', 'cockroach', 'cocoa_(beverage)', 'coconut', 'coffee_maker', 'coffee_table', 'coffeepot', 'coil', 'coin', 'colander', 'coleslaw', 'coloring_material', 'combination_lock', 'pacifier', 'comic_book', 'compass', 'computer_keyboard', 'condiment', 'cone', 'control', 'convertible_(automobile)', 'sofa_bed', 'cooker', 'cookie', 'cooking_utensil', 'cooler_(for_food)', 'cork_(bottle_plug)', 'corkboard', 'corkscrew', 'edible_corn', 'cornbread', 'cornet', 'cornice', 'cornmeal', 'corset', 'costume', 'cougar', 'coverall', 'cowbell', 'cowboy_hat', 'crab_(animal)', 'crabmeat', 'cracker', 'crape', 'crate', 'crayon', 'cream_pitcher', 'crescent_roll', 'crib', 'crock_pot', 'crossbar', 'crouton', 'crow', 'crowbar', 'crown', 'crucifix', 'cruise_ship', 'police_cruiser', 'crumb', 'crutch', 'cub_(animal)', 'cube', 'cucumber', 'cufflink', 'cup', 'trophy_cup', 'cupboard', 'cupcake', 'hair_curler', 'curling_iron', 'curtain', 'cushion', 'cylinder', 'cymbal', 'dagger', 'dalmatian', 'dartboard', 'date_(fruit)', 'deck_chair', 'deer', 'dental_floss', 'desk', 'detergent', 'diaper', 'diary', 'die', 'dinghy', 'dining_table', 'tux', 'dish', 'dish_antenna', 'dishrag', 'dishtowel', 'dishwasher', 'dishwasher_detergent', 'dispenser', 'diving_board', 'Dixie_cup', 'dog', 'dog_collar', 'doll', 'dollar', 'dollhouse', 'dolphin', 'domestic_ass', 'doorknob', 'doormat', 'doughnut', 'dove', 'dragonfly', 'drawer', 'underdrawers', 'dress', 'dress_hat', 'dress_suit', 'dresser', 'drill', 'drone', 'dropper', 'drum_(musical_instrument)', 'drumstick', 'duck', 'duckling', 'duct_tape', 'duffel_bag', 'dumbbell', 'dumpster', 'dustpan', 'eagle', 'earphone', 'earplug', 'earring', 'easel', 'eclair', 'eel', 'egg', 'egg_roll', 'egg_yolk', 'eggbeater', 'eggplant', 'electric_chair', 'refrigerator', 'elephant', 'elk', 'envelope', 'eraser', 'escargot', 'eyepatch', 'falcon', 'fan', 'faucet', 'fedora', 'ferret', 'Ferris_wheel', 'ferry', 'fig_(fruit)', 'fighter_jet', 'figurine', 'file_cabinet', 'file_(tool)', 'fire_alarm', 'fire_engine', 'fire_extinguisher', 'fire_hose', 'fireplace', 'fireplug', 'first-aid_kit', 'fish', 'fish_(food)', 'fishbowl', 'fishing_rod', 'flag', 'flagpole', 'flamingo', 'flannel', 'flap', 'flash', 'flashlight', 'fleece', 'flip-flop_(sandal)', 'flipper_(footwear)', 'flower_arrangement', 'flute_glass', 'foal', 'folding_chair', 'food_processor', 'football_(American)', 'football_helmet', 'footstool', 'fork', 'forklift', 'freight_car', 'French_toast', 'freshener', 'frisbee', 'frog', 'fruit_juice', 'frying_pan', 'fudge', 'funnel', 'futon', 'gag', 'garbage', 'garbage_truck', 'garden_hose', 'gargle', 'gargoyle', 'garlic', 'gasmask', 'gazelle', 'gelatin', 'gemstone', 'generator', 'giant_panda', 'gift_wrap', 'ginger', 'giraffe', 'cincture', 'glass_(drink_container)', 'globe', 'glove', 'goat', 'goggles', 'goldfish', 'golf_club', 'golfcart', 'gondola_(boat)', 'goose', 'gorilla', 'gourd', 'grape', 'grater', 'gravestone', 'gravy_boat', 'green_bean', 'green_onion', 'griddle', 'grill', 'grits', 'grizzly', 'grocery_bag', 'guitar', 'gull', 'gun', 'hairbrush', 'hairnet', 'hairpin', 'halter_top', 'ham', 'hamburger', 'hammer', 'hammock', 'hamper', 'hamster', 'hair_dryer', 'hand_glass', 'hand_towel', 'handcart', 'handcuff', 'handkerchief', 'handle', 'handsaw', 'hardback_book', 'harmonium', 'hat', 'hatbox', 'veil', 'headband', 'headboard', 'headlight', 'headscarf', 'headset', 'headstall_(for_horses)', 'heart', 'heater', 'helicopter', 'helmet', 'heron', 'highchair', 'hinge', 'hippopotamus', 'hockey_stick', 'hog', 'home_plate_(baseball)', 
'honey', 'fume_hood', 'hook', 'hookah', 'hornet', 'horse', 'hose', 'hot-air_balloon', 'hotplate', 'hot_sauce', 'hourglass', 'houseboat', 'hummingbird', 'hummus', 'polar_bear', 'icecream', 'popsicle', 'ice_maker', 'ice_pack', 'ice_skate', 'igniter', 'inhaler', 'iPod', 'iron_(for_clothing)', 'ironing_board', 'jacket', 'jam', 'jar', 'jean', 'jeep', 'jelly_bean', 'jersey', 'jet_plane', 'jewel', 'jewelry', 'joystick', 'jumpsuit', 'kayak', 'keg', 'kennel', 'kettle', 'key', 'keycard', 'kilt', 'kimono', 'kitchen_sink', 'kitchen_table', 'kite', 'kitten', 'kiwi_fruit', 'knee_pad', 'knife', 'knitting_needle', 'knob', 'knocker_(on_a_door)', 'koala', 'lab_coat', 'ladder', 'ladle', 'ladybug', 'lamb_(animal)', 'lamb-chop', 'lamp', 'lamppost', 'lampshade', 'lantern', 'lanyard', 'laptop_computer', 'lasagna', 'latch', 'lawn_mower', 'leather', 'legging_(clothing)', 'Lego', 'legume', 'lemon', 'lemonade', 'lettuce', 'license_plate', 'life_buoy', 'life_jacket', 'lightbulb', 'lightning_rod', 'lime', 'limousine', 'lion', 'lip_balm', 'liquor', 'lizard', 'log', 'lollipop', 'speaker_(stero_equipment)', 'loveseat', 'machine_gun', 'magazine', 'magnet', 'mail_slot', 'mailbox_(at_home)', 'mallard', 'mallet', 'mammoth', 'manatee', 'mandarin_orange', 'manger', 'manhole', 'map', 'marker', 'martini', 'mascot', 'mashed_potato', 'masher', 'mask', 'mast', 'mat_(gym_equipment)', 'matchbox', 'mattress', 'measuring_cup', 'measuring_stick', 'meatball', 'medicine', 'melon', 'microphone', 'microscope', 'microwave_oven', 'milestone', 'milk', 'milk_can', 'milkshake', 'minivan', 'mint_candy', 'mirror', 'mitten', 'mixer_(kitchen_tool)', 'money', 'monitor_(computer_equipment) computer_monitor', 'monkey', 'motor', 'motor_scooter', 'motor_vehicle', 'motorcycle', 'mound_(baseball)', 'mouse_(computer_equipment)', 'mousepad', 'muffin', 'mug', 'mushroom', 'music_stool', 'musical_instrument', 'nailfile', 'napkin', 'neckerchief', 'necklace', 'necktie', 'needle', 'nest', 'newspaper', 'newsstand', 'nightshirt', 'nosebag_(for_animals)', 'noseband_(for_animals)', 'notebook', 'notepad', 'nut', 'nutcracker', 'oar', 'octopus_(food)', 'octopus_(animal)', 'oil_lamp', 'olive_oil', 'omelet', 'onion', 'orange_(fruit)', 'orange_juice', 'ostrich', 'ottoman', 'oven', 'overalls_(clothing)', 'owl', 'packet', 'inkpad', 'pad', 'paddle', 'padlock', 'paintbrush', 'painting', 'pajamas', 'palette', 'pan_(for_cooking)', 'pan_(metal_container)', 'pancake', 'pantyhose', 'papaya', 'paper_plate', 'paper_towel', 'paperback_book', 'paperweight', 'parachute', 'parakeet', 'parasail_(sports)', 'parasol', 'parchment', 'parka', 'parking_meter', 'parrot', 'passenger_car_(part_of_a_train)', 'passenger_ship', 'passport', 'pastry', 'patty_(food)', 'pea_(food)', 'peach', 'peanut_butter', 'pear', 'peeler_(tool_for_fruit_and_vegetables)', 'wooden_leg', 'pegboard', 'pelican', 'pen', 'pencil', 'pencil_box', 'pencil_sharpener', 'pendulum', 'penguin', 'pennant', 'penny_(coin)', 'pepper', 'pepper_mill', 'perfume', 'persimmon', 'person', 'pet', 'pew_(church_bench)', 'phonebook', 'phonograph_record', 'piano', 'pickle', 'pickup_truck', 'pie', 'pigeon', 'piggy_bank', 'pillow', 'pin_(non_jewelry)', 'pineapple', 'pinecone', 'ping-pong_ball', 'pinwheel', 'tobacco_pipe', 'pipe', 'pistol', 'pita_(bread)', 'pitcher_(vessel_for_liquid)', 'pitchfork', 'pizza', 'place_mat', 'plate', 'platter', 'playpen', 'pliers', 'plow_(farm_equipment)', 'plume', 'pocket_watch', 'pocketknife', 'poker_(fire_stirring_tool)', 'pole', 'polo_shirt', 'poncho', 'pony', 'pool_table', 'pop_(soda)', 'postbox_(public)', 
'postcard', 'poster', 'pot', 'flowerpot', 'potato', 'potholder', 'pottery', 'pouch', 'power_shovel', 'prawn', 'pretzel', 'printer', 'projectile_(weapon)', 'projector', 'propeller', 'prune', 'pudding', 'puffer_(fish)', 'puffin', 'pug-dog', 'pumpkin', 'puncher', 'puppet', 'puppy', 'quesadilla', 'quiche', 'quilt', 'rabbit', 'race_car', 'racket', 'radar', 'radiator', 'radio_receiver', 'radish', 'raft', 'rag_doll', 'raincoat', 'ram_(animal)', 'raspberry', 'rat', 'razorblade', 'reamer_(juicer)', 'rearview_mirror', 'receipt', 'recliner', 'record_player', 'reflector', 'remote_control', 'rhinoceros', 'rib_(food)', 'rifle', 'ring', 'river_boat', 'road_map', 'robe', 'rocking_chair', 'rodent', 'roller_skate', 'Rollerblade', 'rolling_pin', 'root_beer', 'router_(computer_equipment)', 'rubber_band', 'runner_(carpet)', 'plastic_bag', 'saddle_(on_an_animal)', 'saddle_blanket', 'saddlebag', 'safety_pin', 'sail', 'salad', 'salad_plate', 'salami', 'salmon_(fish)', 'salmon_(food)', 'salsa', 'saltshaker', 'sandal_(type_of_shoe)', 'sandwich', 'satchel', 'saucepan', 'saucer', 'sausage', 'sawhorse', 'saxophone', 'scale_(measuring_instrument)', 'scarecrow', 'scarf', 'school_bus', 'scissors', 'scoreboard', 'scraper', 'screwdriver', 'scrubbing_brush', 'sculpture', 'seabird', 'seahorse', 'seaplane', 'seashell', 'sewing_machine', 'shaker', 'shampoo', 'shark', 'sharpener', 'Sharpie', 'shaver_(electric)', 'shaving_cream', 'shawl', 'shears', 'sheep', 'shepherd_dog', 'sherbert', 'shield', 'shirt', 'shoe', 'shopping_bag', 'shopping_cart', 'short_pants', 'shot_glass', 'shoulder_bag', 'shovel', 'shower_head', 'shower_cap', 'shower_curtain', 'shredder_(for_paper)', 'signboard', 'silo', 'sink', 'skateboard', 'skewer', 'ski', 'ski_boot', 'ski_parka', 'ski_pole', 'skirt', 'skullcap', 'sled', 'sleeping_bag', 'sling_(bandage)', 'slipper_(footwear)', 'smoothie', 'snake', 'snowboard', 'snowman', 'snowmobile', 'soap', 'soccer_ball', 'sock', 'sofa', 'softball', 'solar_array', 'sombrero', 'soup', 'soup_bowl', 'soupspoon', 'sour_cream', 'soya_milk', 'space_shuttle', 'sparkler_(fireworks)', 'spatula', 'spear', 'spectacles', 'spice_rack', 'spider', 'crawfish', 'sponge', 'spoon', 'sportswear', 'spotlight', 'squid_(food)', 'squirrel', 'stagecoach', 'stapler_(stapling_machine)', 'starfish', 'statue_(sculpture)', 'steak_(food)', 'steak_knife', 'steering_wheel', 'stepladder', 'step_stool', 'stereo_(sound_system)', 'stew', 'stirrer', 'stirrup', 'stool', 'stop_sign', 'brake_light', 'stove', 'strainer', 'strap', 'straw_(for_drinking)', 'strawberry', 'street_sign', 'streetlight', 'string_cheese', 'stylus', 'subwoofer', 'sugar_bowl', 'sugarcane_(plant)', 'suit_(clothing)', 'sunflower', 'sunglasses', 'sunhat', 'surfboard', 'sushi', 'mop', 'sweat_pants', 'sweatband', 'sweater', 'sweatshirt', 'sweet_potato', 'swimsuit', 'sword', 'syringe', 'Tabasco_sauce', 'table-tennis_table', 'table', 'table_lamp', 'tablecloth', 'tachometer', 'taco', 'tag', 'taillight', 'tambourine', 'army_tank', 'tank_(storage_vessel)', 'tank_top_(clothing)', 'tape_(sticky_cloth_or_paper)', 'tape_measure', 'tapestry', 'tarp', 'tartan', 'tassel', 'tea_bag', 'teacup', 'teakettle', 'teapot', 'teddy_bear', 'telephone', 'telephone_booth', 'telephone_pole', 'telephoto_lens', 'television_camera', 'television_set', 'tennis_ball', 'tennis_racket', 'tequila', 'thermometer', 'thermos_bottle', 'thermostat', 'thimble', 'thread', 'thumbtack', 'tiara', 'tiger', 'tights_(clothing)', 'timer', 'tinfoil', 'tinsel', 'tissue_paper', 'toast_(food)', 'toaster', 'toaster_oven', 'toilet', 'toilet_tissue', 
'tomato', 'tongs', 'toolbox', 'toothbrush', 'toothpaste', 'toothpick', 'cover', 'tortilla', 'tow_truck', 'towel', 'towel_rack', 'toy', 'tractor_(farm_equipment)', 'traffic_light', 'dirt_bike', 'trailer_truck', 'train_(railroad_vehicle)', 'trampoline', 'tray', 'trench_coat', 'triangle_(musical_instrument)', 'tricycle', 'tripod', 'trousers', 'truck', 'truffle_(chocolate)', 'trunk', 'vat', 'turban', 'turkey_(food)', 'turnip', 'turtle', 'turtleneck_(clothing)', 'typewriter', 'umbrella', 'underwear', 'unicycle', 'urinal', 'urn', 'vacuum_cleaner', 'vase', 'vending_machine', 'vent', 'vest', 'videotape', 'vinegar', 'violin', 'vodka', 'volleyball', 'vulture', 'waffle', 'waffle_iron', 'wagon', 'wagon_wheel', 'walking_stick', 'wall_clock', 'wall_socket', 'wallet', 'walrus', 'wardrobe', 'washbasin', 'automatic_washer', 'watch', 'water_bottle', 'water_cooler', 'water_faucet', 'water_heater', 'water_jug', 'water_gun', 'water_scooter', 'water_ski', 'water_tower', 'watering_can', 'watermelon', 'weathervane', 'webcam', 'wedding_cake', 'wedding_ring', 'wet_suit', 'wheel', 'wheelchair', 'whipped_cream', 'whistle', 'wig', 'wind_chime', 'windmill', 'window_box_(for_plants)', 'windshield_wiper', 'windsock', 'wine_bottle', 'wine_bucket', 'wineglass', 'blinder_(for_horses)', 'wok', 'wolf', 'wooden_spoon', 'wreath', 'wrench', 'wristband', 'wristlet', 'yacht', 'yogurt', 'yoke_(animal_equipment)', 'zebra', 'zucchini']
proxydet/config.py ADDED
@@ -0,0 +1,156 @@
# Copyright (c) Facebook, Inc. and its affiliates.
'''
Modifications Copyright (c) 2024-present NAVER Corp, Apache License v2.0
original source: https://github.com/facebookresearch/Detic/blob/main/detic/config.py
'''
from detectron2.config import CfgNode as CN

def add_proxydet_config(cfg):
    _C = cfg

    _C.WITH_IMAGE_LABELS = False  # Turn on co-training with classification data

    # Open-vocabulary classifier
    _C.MODEL.ROI_BOX_HEAD.USE_ZEROSHOT_CLS = False  # Use fixed classifier for open-vocabulary detection
    _C.MODEL.ROI_BOX_HEAD.ZEROSHOT_WEIGHT_PATH = 'datasets/metadata/lvis_v1_clip_a+cname.npy'
    _C.MODEL.ROI_BOX_HEAD.ZEROSHOT_WEIGHT_DIM = 512
    _C.MODEL.ROI_BOX_HEAD.NORM_WEIGHT = True
    _C.MODEL.ROI_BOX_HEAD.NORM_TEMP = 50.0
    _C.MODEL.ROI_BOX_HEAD.IGNORE_ZERO_CATS = False
    _C.MODEL.ROI_BOX_HEAD.USE_BIAS = 0.0  # >= 0: not use

    _C.MODEL.ROI_BOX_HEAD.MULT_PROPOSAL_SCORE = False  # CenterNet2
    _C.MODEL.ROI_BOX_HEAD.USE_SIGMOID_CE = False
    _C.MODEL.ROI_BOX_HEAD.PRIOR_PROB = 0.01
    _C.MODEL.ROI_BOX_HEAD.USE_FED_LOSS = False  # Federated Loss
    _C.MODEL.ROI_BOX_HEAD.CAT_FREQ_PATH = \
        'datasets/metadata/lvis_v1_train_cat_info.json'
    _C.MODEL.ROI_BOX_HEAD.FED_LOSS_NUM_CAT = 50
    _C.MODEL.ROI_BOX_HEAD.FED_LOSS_FREQ_WEIGHT = 0.5

    # Classification data configs
    _C.MODEL.ROI_BOX_HEAD.IMAGE_LABEL_LOSS = 'max_size'  # max, softmax, sum
    _C.MODEL.ROI_BOX_HEAD.IMAGE_LOSS_WEIGHT = 0.1
    _C.MODEL.ROI_BOX_HEAD.IMAGE_BOX_SIZE = 1.0
    _C.MODEL.ROI_BOX_HEAD.ADD_IMAGE_BOX = False  # Used for image-box loss and caption loss
    _C.MODEL.ROI_BOX_HEAD.WS_NUM_PROPS = 128  # num proposals for image-labeled data
    _C.MODEL.ROI_BOX_HEAD.WITH_SOFTMAX_PROP = False  # Used for WSDDN
    _C.MODEL.ROI_BOX_HEAD.CAPTION_WEIGHT = 1.0  # Caption loss weight
    _C.MODEL.ROI_BOX_HEAD.NEG_CAP_WEIGHT = 0.125  # Caption loss hyper-parameter
    _C.MODEL.ROI_BOX_HEAD.ADD_FEATURE_TO_PROP = False  # Used for WSDDN
    _C.MODEL.ROI_BOX_HEAD.SOFTMAX_WEAK_LOSS = False  # Used when USE_SIGMOID_CE is False

    _C.MODEL.ROI_HEADS.MASK_WEIGHT = 1.0
    _C.MODEL.ROI_HEADS.ONE_CLASS_PER_PROPOSAL = False  # For demo only

    # Class-wise Multi-Modal Mixup
    _C.MODEL.ROI_BOX_HEAD.USE_REGIONAL_EMBEDDING = False
    _C.MODEL.ROI_HEADS.BASE_CAT_MASK = "datasets/metadata/lvis_v1_base_cat_mask.npy"
    _C.MODEL.ROI_HEADS.CMM = CN()
    _C.MODEL.ROI_HEADS.CMM.MIXUP_STAGE = []
    _C.MODEL.ROI_HEADS.CMM.MIXUP_STAGE_TEST = None
    _C.MODEL.ROI_HEADS.CMM.MIXUP_BETA = 1.0
    _C.MODEL.ROI_HEADS.CMM.LOSS = "l1"
    _C.MODEL.ROI_HEADS.CMM.LOSS_WEIGHT = 1.0
    _C.MODEL.ROI_HEADS.CMM.SEPARATED_BRANCH = False
    _C.MODEL.ROI_HEADS.CMM.BASE_ALPHA = 0.5
    _C.MODEL.ROI_HEADS.CMM.NOVEL_BETA = 0.5
    _C.MODEL.ROI_HEADS.CMM.USE_INL = False
    _C.MODEL.ROI_HEADS.CMM.PROTOTYPE = "center"
    _C.MODEL.ROI_HEADS.CMM.PROTOTYPE_TEMP = 1.0
    _C.MODEL.ROI_HEADS.CMM.CLASSIFIER_TEMP = None
    _C.MODEL.ROI_HEADS.CMM.USE_SIGMOID_CE = True

    # Caption losses
    _C.MODEL.CAP_BATCH_RATIO = 4  # Ratio between detection data and caption data
    _C.MODEL.WITH_CAPTION = False
    _C.MODEL.SYNC_CAPTION_BATCH = False  # synchronize across GPUs to enlarge # "classes"

    # dynamic class sampling when training with 21K classes
    _C.MODEL.DYNAMIC_CLASSIFIER = False
    _C.MODEL.NUM_SAMPLE_CATS = 50

    # Different classifiers in testing, used in cross-dataset evaluation
    _C.MODEL.RESET_CLS_TESTS = False
    _C.MODEL.TEST_CLASSIFIERS = []
    _C.MODEL.TEST_NUM_CLASSES = []

    # Backbones
    _C.MODEL.SWIN = CN()
    _C.MODEL.SWIN.SIZE = 'T'  # 'T', 'S', 'B'
    _C.MODEL.SWIN.USE_CHECKPOINT = False
    _C.MODEL.SWIN.OUT_FEATURES = (1, 2, 3)  # FPN stride 8 - 32

    _C.MODEL.TIMM = CN()
    _C.MODEL.TIMM.BASE_NAME = 'resnet50'
    _C.MODEL.TIMM.OUT_LEVELS = (3, 4, 5)
    _C.MODEL.TIMM.NORM = 'FrozenBN'
    _C.MODEL.TIMM.FREEZE_AT = 0
    _C.MODEL.TIMM.PRETRAINED = False
    _C.MODEL.DATASET_LOSS_WEIGHT = []

    # Multi-dataset dataloader
    _C.DATALOADER.DATASET_RATIO = [1, 1]  # sample ratio
    _C.DATALOADER.USE_RFS = [False, False]
    _C.DATALOADER.MULTI_DATASET_GROUPING = False  # Always true when multi-dataset is enabled
    _C.DATALOADER.DATASET_ANN = ['box', 'box']  # Annotation type of each dataset
    _C.DATALOADER.USE_DIFF_BS_SIZE = False  # Use different batchsize for each dataset
    _C.DATALOADER.DATASET_BS = [8, 32]  # Used when USE_DIFF_BS_SIZE is on
    _C.DATALOADER.DATASET_INPUT_SIZE = [896, 384]  # Used when USE_DIFF_BS_SIZE is on
    _C.DATALOADER.DATASET_INPUT_SCALE = [(0.1, 2.0), (0.5, 1.5)]  # Used when USE_DIFF_BS_SIZE is on
    _C.DATALOADER.DATASET_MIN_SIZES = [(640, 800), (320, 400)]  # Used when USE_DIFF_BS_SIZE is on
    _C.DATALOADER.DATASET_MAX_SIZES = [1333, 667]  # Used when USE_DIFF_BS_SIZE is on
    _C.DATALOADER.USE_TAR_DATASET = False  # for ImageNet-21K, directly reading from unziped files
    _C.DATALOADER.TARFILE_PATH = 'datasets/imagenet/metadata-22k/tar_files.npy'
    _C.DATALOADER.TAR_INDEX_DIR = 'datasets/imagenet/metadata-22k/tarindex_npy'

    _C.SOLVER.USE_CUSTOM_SOLVER = False
    _C.SOLVER.OPTIMIZER = 'SGD'
    _C.SOLVER.BACKBONE_MULTIPLIER = 1.0  # Used in DETR
    _C.SOLVER.CUSTOM_MULTIPLIER = 1.0  # Used in DETR
    _C.SOLVER.CUSTOM_MULTIPLIER_NAME = []  # Used in DETR

    # Deformable DETR
    _C.MODEL.DETR = CN()
    _C.MODEL.DETR.NUM_CLASSES = 80
    _C.MODEL.DETR.FROZEN_WEIGHTS = ''  # For Segmentation
    _C.MODEL.DETR.GIOU_WEIGHT = 2.0
    _C.MODEL.DETR.L1_WEIGHT = 5.0
    _C.MODEL.DETR.DEEP_SUPERVISION = True
    _C.MODEL.DETR.NO_OBJECT_WEIGHT = 0.1
    _C.MODEL.DETR.CLS_WEIGHT = 2.0
    _C.MODEL.DETR.NUM_FEATURE_LEVELS = 4
    _C.MODEL.DETR.TWO_STAGE = False
    _C.MODEL.DETR.WITH_BOX_REFINE = False
    _C.MODEL.DETR.FOCAL_ALPHA = 0.25
    _C.MODEL.DETR.NHEADS = 8
    _C.MODEL.DETR.DROPOUT = 0.1
    _C.MODEL.DETR.DIM_FEEDFORWARD = 2048
    _C.MODEL.DETR.ENC_LAYERS = 6
    _C.MODEL.DETR.DEC_LAYERS = 6
    _C.MODEL.DETR.PRE_NORM = False
    _C.MODEL.DETR.HIDDEN_DIM = 256
    _C.MODEL.DETR.NUM_OBJECT_QUERIES = 100

    _C.MODEL.DETR.USE_FED_LOSS = False
    _C.MODEL.DETR.WEAK_WEIGHT = 0.1

    _C.INPUT.CUSTOM_AUG = ''
    _C.INPUT.TRAIN_SIZE = 640
    _C.INPUT.TEST_SIZE = 640
    _C.INPUT.SCALE_RANGE = (0.1, 2.)
    # 'default' for fixed short/ long edge, 'square' for max size=INPUT.SIZE
    _C.INPUT.TEST_INPUT_TYPE = 'default'

    _C.FIND_UNUSED_PARAM = True
    _C.EVAL_PRED_AR = False
    _C.EVAL_PROPOSAL_AR = False
    _C.EVAL_CAT_SPEC_AR = False
    _C.IS_DEBUG = False
    _C.QUICK_DEBUG = False
    _C.FP16 = False
    _C.EVAL_AP_FIX = False
    _C.GEN_PSEDO_LABELS = False
    _C.SAVE_DEBUG_PATH = 'output/save_debug/'

    _C.EVAL_START = 0
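Note how the CMM mixup weights default here to BASE_ALPHA/NOVEL_BETA of 0.5, are overridden to 0.15/0.35 in the ProxyDet configs, and are overridden again at runtime by the demo sliders. The snippet below is illustrative only and assumes the roi_heads attribute names used by app.py in this commit:

    # Mirror the slider override from app.py on a built VisualizationDemo (hypothetical usage).
    roi_heads = ovd_demo.predictor.model.roi_heads
    roi_heads.cmm_base_alpha = cfg.MODEL.ROI_HEADS.CMM.BASE_ALPHA  # 0.15 in the ProxyDet configs
    roi_heads.cmm_novel_beta = cfg.MODEL.ROI_HEADS.CMM.NOVEL_BETA  # 0.35 in the ProxyDet configs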
proxydet/custom_solver.py ADDED
@@ -0,0 +1,78 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from enum import Enum
import itertools
from typing import Any, Callable, Dict, Iterable, List, Set, Type, Union
import torch

from detectron2.config import CfgNode

from detectron2.solver.build import maybe_add_gradient_clipping

def match_name_keywords(n, name_keywords):
    out = False
    for b in name_keywords:
        if b in n:
            out = True
            break
    return out

def build_custom_optimizer(cfg: CfgNode, model: torch.nn.Module) -> torch.optim.Optimizer:
    """
    Build an optimizer from config.
    """
    params: List[Dict[str, Any]] = []
    memo: Set[torch.nn.parameter.Parameter] = set()
    custom_multiplier_name = cfg.SOLVER.CUSTOM_MULTIPLIER_NAME
    optimizer_type = cfg.SOLVER.OPTIMIZER
    for key, value in model.named_parameters(recurse=True):
        if not value.requires_grad:
            continue
        # Avoid duplicating parameters
        if value in memo:
            continue
        memo.add(value)
        lr = cfg.SOLVER.BASE_LR
        weight_decay = cfg.SOLVER.WEIGHT_DECAY
        if "backbone" in key:
            lr = lr * cfg.SOLVER.BACKBONE_MULTIPLIER
        if match_name_keywords(key, custom_multiplier_name):
            lr = lr * cfg.SOLVER.CUSTOM_MULTIPLIER
            print('Costum LR', key, lr)
        param = {"params": [value], "lr": lr}
        if optimizer_type != 'ADAMW':
            param['weight_decay'] = weight_decay
        params += [param]

    def maybe_add_full_model_gradient_clipping(optim):  # optim: the optimizer class
        # detectron2 doesn't have full model gradient clipping now
        clip_norm_val = cfg.SOLVER.CLIP_GRADIENTS.CLIP_VALUE
        enable = (
            cfg.SOLVER.CLIP_GRADIENTS.ENABLED
            and cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE == "full_model"
            and clip_norm_val > 0.0
        )

        class FullModelGradientClippingOptimizer(optim):
            def step(self, closure=None):
                all_params = itertools.chain(*[x["params"] for x in self.param_groups])
                torch.nn.utils.clip_grad_norm_(all_params, clip_norm_val)
                super().step(closure=closure)

        return FullModelGradientClippingOptimizer if enable else optim


    if optimizer_type == 'SGD':
        optimizer = maybe_add_full_model_gradient_clipping(torch.optim.SGD)(
            params, cfg.SOLVER.BASE_LR, momentum=cfg.SOLVER.MOMENTUM,
            nesterov=cfg.SOLVER.NESTEROV
        )
    elif optimizer_type == 'ADAMW':
        optimizer = maybe_add_full_model_gradient_clipping(torch.optim.AdamW)(
            params, cfg.SOLVER.BASE_LR,
            weight_decay=cfg.SOLVER.WEIGHT_DECAY
        )
    else:
        raise NotImplementedError(f"no optimizer type {optimizer_type}")
    if not cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE == "full_model":
        optimizer = maybe_add_gradient_clipping(cfg, optimizer)
    return optimizer
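For context (not part of the commit), build_custom_optimizer() replaces detectron2's default optimizer builder when SOLVER.USE_CUSTOM_SOLVER is on. A minimal sketch, assuming cfg was assembled as in demo.setup_cfg():

    from detectron2.modeling import build_model
    from proxydet.custom_solver import build_custom_optimizer

    model = build_model(cfg)                        # cfg.MODEL.* defines backbone and heads
    optimizer = build_custom_optimizer(cfg, model)  # SGD or AdamW per cfg.SOLVER.OPTIMIZER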
proxydet/data/custom_build_augmentation.py ADDED
@@ -0,0 +1,51 @@
# Copyright (c) Facebook, Inc. and its affiliates.
import logging
import numpy as np
import pycocotools.mask as mask_util
import torch
from fvcore.common.file_io import PathManager
from PIL import Image


from detectron2.data import transforms as T
from .transforms.custom_augmentation_impl import EfficientDetResizeCrop

def build_custom_augmentation(cfg, is_train, scale=None, size=None, \
    min_size=None, max_size=None):
    """
    Create a list of default :class:`Augmentation` from config.
    Now it includes resizing and flipping.

    Returns:
        list[Augmentation]
    """
    if cfg.INPUT.CUSTOM_AUG == 'ResizeShortestEdge':
        if is_train:
            min_size = cfg.INPUT.MIN_SIZE_TRAIN if min_size is None else min_size
            max_size = cfg.INPUT.MAX_SIZE_TRAIN if max_size is None else max_size
            sample_style = cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING
        else:
            min_size = cfg.INPUT.MIN_SIZE_TEST
            max_size = cfg.INPUT.MAX_SIZE_TEST
            sample_style = "choice"
        augmentation = [T.ResizeShortestEdge(min_size, max_size, sample_style)]
    elif cfg.INPUT.CUSTOM_AUG == 'EfficientDetResizeCrop':
        if is_train:
            scale = cfg.INPUT.SCALE_RANGE if scale is None else scale
            size = cfg.INPUT.TRAIN_SIZE if size is None else size
        else:
            scale = (1, 1)
            size = cfg.INPUT.TEST_SIZE
        augmentation = [EfficientDetResizeCrop(size, scale)]
    else:
        assert 0, cfg.INPUT.CUSTOM_AUG

    if is_train:
        augmentation.append(T.RandomFlip())
    return augmentation


build_custom_transform_gen = build_custom_augmentation
"""
Alias for backward-compatibility.
"""
proxydet/data/custom_dataset_dataloader.py ADDED
@@ -0,0 +1,331 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ # Part of the code is from https://github.com/xingyizhou/UniDet/blob/master/projects/UniDet/unidet/data/multi_dataset_dataloader.py (Apache-2.0 License)
3
+ import copy
4
+ import logging
5
+ import numpy as np
6
+ import operator
7
+ import torch
8
+ import torch.utils.data
9
+ import json
10
+ from detectron2.utils.comm import get_world_size
11
+ from detectron2.utils.logger import _log_api_usage, log_first_n
12
+
13
+ from detectron2.config import configurable
14
+ from detectron2.data import samplers
15
+ from torch.utils.data.sampler import BatchSampler, Sampler
16
+ from detectron2.data.common import DatasetFromList, MapDataset
17
+ from detectron2.data.dataset_mapper import DatasetMapper
18
+ from detectron2.data.build import get_detection_dataset_dicts, build_batch_data_loader
19
+ from detectron2.data.samplers import TrainingSampler, RepeatFactorTrainingSampler
20
+ from detectron2.data.build import worker_init_reset_seed, print_instances_class_histogram
21
+ from detectron2.data.build import filter_images_with_only_crowd_annotations
22
+ from detectron2.data.build import filter_images_with_few_keypoints
23
+ from detectron2.data.build import check_metadata_consistency
24
+ from detectron2.data.catalog import MetadataCatalog, DatasetCatalog
25
+ from detectron2.utils import comm
26
+ import itertools
27
+ import math
28
+ from collections import defaultdict
29
+ from typing import Optional
30
+
31
+
32
+ def _custom_train_loader_from_config(cfg, mapper=None, *, dataset=None, sampler=None):
33
+ sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
34
+ if 'MultiDataset' in sampler_name:
35
+ dataset_dicts = get_detection_dataset_dicts_with_source(
36
+ cfg.DATASETS.TRAIN,
37
+ filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS,
38
+ min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE
39
+ if cfg.MODEL.KEYPOINT_ON else 0,
40
+ proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None,
41
+ )
42
+ else:
43
+ dataset_dicts = get_detection_dataset_dicts(
44
+ cfg.DATASETS.TRAIN,
45
+ filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS,
46
+ min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE
47
+ if cfg.MODEL.KEYPOINT_ON else 0,
48
+ proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None,
49
+ )
50
+
51
+ if mapper is None:
52
+ mapper = DatasetMapper(cfg, True)
53
+
54
+ if sampler is not None:
55
+ pass
56
+ elif sampler_name == "TrainingSampler":
57
+ sampler = TrainingSampler(len(dataset))
58
+ elif sampler_name == "MultiDatasetSampler":
59
+ sampler = MultiDatasetSampler(
60
+ dataset_dicts,
61
+ dataset_ratio = cfg.DATALOADER.DATASET_RATIO,
62
+ use_rfs = cfg.DATALOADER.USE_RFS,
63
+ dataset_ann = cfg.DATALOADER.DATASET_ANN,
64
+ repeat_threshold = cfg.DATALOADER.REPEAT_THRESHOLD,
65
+ )
66
+ elif sampler_name == "RepeatFactorTrainingSampler":
67
+ repeat_factors = RepeatFactorTrainingSampler.repeat_factors_from_category_frequency(
68
+ dataset_dicts, cfg.DATALOADER.REPEAT_THRESHOLD
69
+ )
70
+ sampler = RepeatFactorTrainingSampler(repeat_factors)
71
+ else:
72
+ raise ValueError("Unknown training sampler: {}".format(sampler_name))
73
+
74
+ return {
75
+ "dataset": dataset_dicts,
76
+ "sampler": sampler,
77
+ "mapper": mapper,
78
+ "total_batch_size": cfg.SOLVER.IMS_PER_BATCH,
79
+ "aspect_ratio_grouping": cfg.DATALOADER.ASPECT_RATIO_GROUPING,
80
+ "num_workers": cfg.DATALOADER.NUM_WORKERS,
81
+ 'multi_dataset_grouping': cfg.DATALOADER.MULTI_DATASET_GROUPING,
82
+ 'use_diff_bs_size': cfg.DATALOADER.USE_DIFF_BS_SIZE,
83
+ 'dataset_bs': cfg.DATALOADER.DATASET_BS,
84
+ 'num_datasets': len(cfg.DATASETS.TRAIN)
85
+ }
86
+
87
+
88
+ @configurable(from_config=_custom_train_loader_from_config)
89
+ def build_custom_train_loader(
90
+ dataset, *, mapper, sampler,
91
+ total_batch_size=16,
92
+ aspect_ratio_grouping=True,
93
+ num_workers=0,
94
+ num_datasets=1,
95
+ multi_dataset_grouping=False,
96
+ use_diff_bs_size=False,
97
+ dataset_bs=[]
98
+ ):
99
+ """
100
+ Modified from detectron2.data.build.build_detection_train_loader, but supports
101
+ different samplers
102
+ """
103
+ if isinstance(dataset, list):
104
+ dataset = DatasetFromList(dataset, copy=False)
105
+ if mapper is not None:
106
+ dataset = MapDataset(dataset, mapper)
107
+ if sampler is None:
108
+ sampler = TrainingSampler(len(dataset))
109
+ assert isinstance(sampler, torch.utils.data.sampler.Sampler)
110
+ if multi_dataset_grouping:
111
+ return build_multi_dataset_batch_data_loader(
112
+ use_diff_bs_size,
113
+ dataset_bs,
114
+ dataset,
115
+ sampler,
116
+ total_batch_size,
117
+ num_datasets=num_datasets,
118
+ num_workers=num_workers,
119
+ )
120
+ else:
121
+ return build_batch_data_loader(
122
+ dataset,
123
+ sampler,
124
+ total_batch_size,
125
+ aspect_ratio_grouping=aspect_ratio_grouping,
126
+ num_workers=num_workers,
127
+ )
128
+
129
+
130
+ def build_multi_dataset_batch_data_loader(
131
+ use_diff_bs_size, dataset_bs,
132
+ dataset, sampler, total_batch_size, num_datasets, num_workers=0
133
+ ):
134
+ """Build a per-GPU dataloader whose batches are grouped by dataset source and aspect ratio.
135
+ """
136
+ world_size = get_world_size()
137
+ assert (
138
+ total_batch_size > 0 and total_batch_size % world_size == 0
139
+ ), "Total batch size ({}) must be divisible by the number of gpus ({}).".format(
140
+ total_batch_size, world_size
141
+ )
142
+
143
+ batch_size = total_batch_size // world_size
144
+ data_loader = torch.utils.data.DataLoader(
145
+ dataset,
146
+ sampler=sampler,
147
+ num_workers=num_workers,
148
+ batch_sampler=None,
149
+ collate_fn=operator.itemgetter(0), # don't batch, but yield individual elements
150
+ worker_init_fn=worker_init_reset_seed,
151
+ ) # yield individual mapped dict
152
+ if use_diff_bs_size:
153
+ return DIFFMDAspectRatioGroupedDataset(
154
+ data_loader, dataset_bs, num_datasets)
155
+ else:
156
+ return MDAspectRatioGroupedDataset(
157
+ data_loader, batch_size, num_datasets)
158
+
159
+
160
+ def get_detection_dataset_dicts_with_source(
161
+ dataset_names, filter_empty=True, min_keypoints=0, proposal_files=None
162
+ ):
163
+ assert len(dataset_names)
164
+ dataset_dicts = [DatasetCatalog.get(dataset_name) for dataset_name in dataset_names]
165
+ for dataset_name, dicts in zip(dataset_names, dataset_dicts):
166
+ assert len(dicts), "Dataset '{}' is empty!".format(dataset_name)
167
+
168
+ for source_id, (dataset_name, dicts) in \
169
+ enumerate(zip(dataset_names, dataset_dicts)):
170
+ assert len(dicts), "Dataset '{}' is empty!".format(dataset_name)
171
+ for d in dicts:
172
+ d['dataset_source'] = source_id
173
+
174
+ if "annotations" in dicts[0]:
175
+ try:
176
+ class_names = MetadataCatalog.get(dataset_name).thing_classes
177
+ check_metadata_consistency("thing_classes", dataset_name)
178
+ print_instances_class_histogram(dicts, class_names)
179
+ except AttributeError: # class names are not available for this dataset
180
+ pass
181
+
182
+ assert proposal_files is None
183
+
184
+ dataset_dicts = list(itertools.chain.from_iterable(dataset_dicts))
185
+
186
+ has_instances = "annotations" in dataset_dicts[0]
187
+ if filter_empty and has_instances:
188
+ dataset_dicts = filter_images_with_only_crowd_annotations(dataset_dicts)
189
+ if min_keypoints > 0 and has_instances:
190
+ dataset_dicts = filter_images_with_few_keypoints(dataset_dicts, min_keypoints)
191
+
192
+ return dataset_dicts
193
+
194
+
195
+ class MultiDatasetSampler(Sampler):
196
+ def __init__(
197
+ self,
198
+ dataset_dicts,
199
+ dataset_ratio,
200
+ use_rfs,
201
+ dataset_ann,
202
+ repeat_threshold=0.001,
203
+ seed: Optional[int] = None,
204
+ ):
205
+ """Infinite sampler that draws indices using per-dataset ratios and optional repeat-factor (RFS) re-weighting.
206
+ """
207
+ sizes = [0 for _ in range(len(dataset_ratio))]
208
+ for d in dataset_dicts:
209
+ sizes[d['dataset_source']] += 1
210
+ print('dataset sizes', sizes)
211
+ self.sizes = sizes
212
+ assert len(dataset_ratio) == len(sizes), \
213
+ 'length of dataset ratio {} should be equal to the number of datasets {}'.format(
214
+ len(dataset_ratio), len(sizes)
215
+ )
216
+ if seed is None:
217
+ seed = comm.shared_random_seed()
218
+ self._seed = int(seed)
219
+ self._rank = comm.get_rank()
220
+ self._world_size = comm.get_world_size()
221
+
222
+ self.dataset_ids = torch.tensor(
223
+ [d['dataset_source'] for d in dataset_dicts], dtype=torch.long)
224
+
225
+ dataset_weight = [torch.ones(s) * max(sizes) / s * r / sum(dataset_ratio) \
226
+ for i, (r, s) in enumerate(zip(dataset_ratio, sizes))]
227
+ dataset_weight = torch.cat(dataset_weight)
228
+
229
+ rfs_factors = []
230
+ st = 0
231
+ for i, s in enumerate(sizes):
232
+ if use_rfs[i]:
233
+ if dataset_ann[i] == 'box':
234
+ rfs_func = RepeatFactorTrainingSampler.repeat_factors_from_category_frequency
235
+ else:
236
+ rfs_func = repeat_factors_from_tag_frequency
237
+ rfs_factor = rfs_func(
238
+ dataset_dicts[st: st + s],
239
+ repeat_thresh=repeat_threshold)
240
+ rfs_factor = rfs_factor * (s / rfs_factor.sum())
241
+ else:
242
+ rfs_factor = torch.ones(s)
243
+ rfs_factors.append(rfs_factor)
244
+ st = st + s
245
+ rfs_factors = torch.cat(rfs_factors)
246
+
247
+ self.weights = dataset_weight * rfs_factors
248
+ self.sample_epoch_size = len(self.weights)
249
+
250
+ def __iter__(self):
251
+ start = self._rank
252
+ yield from itertools.islice(
253
+ self._infinite_indices(), start, None, self._world_size)
254
+
255
+
256
+ def _infinite_indices(self):
257
+ g = torch.Generator()
258
+ g.manual_seed(self._seed)
259
+ while True:
260
+ ids = torch.multinomial(
261
+ self.weights, self.sample_epoch_size, generator=g,
262
+ replacement=True)
263
+ nums = [(self.dataset_ids[ids] == i).sum().int().item() \
264
+ for i in range(len(self.sizes))]
265
+ yield from ids
266
+
267
+
268
+ class MDAspectRatioGroupedDataset(torch.utils.data.IterableDataset):
269
+ def __init__(self, dataset, batch_size, num_datasets):
270
+ """Batch consecutive samples that share a dataset source and aspect-ratio bucket.
271
+ """
272
+ self.dataset = dataset
273
+ self.batch_size = batch_size
274
+ self._buckets = [[] for _ in range(2 * num_datasets)]
275
+
276
+ def __iter__(self):
277
+ for d in self.dataset:
278
+ w, h = d["width"], d["height"]
279
+ aspect_ratio_bucket_id = 0 if w > h else 1
280
+ bucket_id = d['dataset_source'] * 2 + aspect_ratio_bucket_id
281
+ bucket = self._buckets[bucket_id]
282
+ bucket.append(d)
283
+ if len(bucket) == self.batch_size:
284
+ yield bucket[:]
285
+ del bucket[:]
286
+
287
+
288
+ class DIFFMDAspectRatioGroupedDataset(torch.utils.data.IterableDataset):
289
+ def __init__(self, dataset, batch_sizes, num_datasets):
290
+ """Same as MDAspectRatioGroupedDataset, but with a per-dataset batch size.
291
+ """
292
+ self.dataset = dataset
293
+ self.batch_sizes = batch_sizes
294
+ self._buckets = [[] for _ in range(2 * num_datasets)]
295
+
296
+ def __iter__(self):
297
+ for d in self.dataset:
298
+ w, h = d["width"], d["height"]
299
+ aspect_ratio_bucket_id = 0 if w > h else 1
300
+ bucket_id = d['dataset_source'] * 2 + aspect_ratio_bucket_id
301
+ bucket = self._buckets[bucket_id]
302
+ bucket.append(d)
303
+ if len(bucket) == self.batch_sizes[d['dataset_source']]:
304
+ yield bucket[:]
305
+ del bucket[:]
306
+
307
+
308
+ def repeat_factors_from_tag_frequency(dataset_dicts, repeat_thresh):
309
+ """Compute LVIS-style repeat factors from image-level tag (pos_category_ids) frequencies.
310
+ """
311
+ category_freq = defaultdict(int)
312
+ for dataset_dict in dataset_dicts:
313
+ cat_ids = dataset_dict['pos_category_ids']
314
+ for cat_id in cat_ids:
315
+ category_freq[cat_id] += 1
316
+ num_images = len(dataset_dicts)
317
+ for k, v in category_freq.items():
318
+ category_freq[k] = v / num_images
319
+
320
+ category_rep = {
321
+ cat_id: max(1.0, math.sqrt(repeat_thresh / cat_freq))
322
+ for cat_id, cat_freq in category_freq.items()
323
+ }
324
+
325
+ rep_factors = []
326
+ for dataset_dict in dataset_dicts:
327
+ cat_ids = dataset_dict['pos_category_ids']
328
+ rep_factor = max({category_rep[cat_id] for cat_id in cat_ids}, default=1.0)
329
+ rep_factors.append(rep_factor)
330
+
331
+ return torch.tensor(rep_factors, dtype=torch.float32)
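
A note on the sampler added above: each example is weighted by max(sizes) / size * ratio / sum(ratio) for its source dataset and then multiplied by LVIS-style repeat factors, so the expected share of each dataset follows DATALOADER.DATASET_RATIO regardless of raw dataset sizes. A minimal sketch of that weighting with made-up sizes and ratios (illustrative values, not taken from this commit's configs):

    import torch

    sizes = [100000, 1200]      # e.g. a box-labelled set vs. an image-labelled set (made up)
    dataset_ratio = [1.0, 4.0]  # plays the role of cfg.DATALOADER.DATASET_RATIO

    # same per-example weight as MultiDatasetSampler.__init__ (without the RFS term)
    weights = torch.cat([
        torch.ones(s) * max(sizes) / s * r / sum(dataset_ratio)
        for r, s in zip(dataset_ratio, sizes)
    ])

    g = torch.Generator().manual_seed(0)
    ids = torch.multinomial(weights, len(weights), generator=g, replacement=True)
    frac = (ids >= sizes[0]).float().mean().item()
    print(f"share of samples drawn from the small dataset: {frac:.2f}")  # ~0.80, i.e. the 1:4 ratio
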
proxydet/data/custom_dataset_mapper.py ADDED
@@ -0,0 +1,280 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2
+ import copy
3
+ import logging
4
+ import numpy as np
5
+ from typing import List, Optional, Union
6
+ import torch
7
+ import pycocotools.mask as mask_util
8
+
9
+ from detectron2.config import configurable
10
+
11
+ from detectron2.data import detection_utils as utils
12
+ from detectron2.data.detection_utils import transform_keypoint_annotations
13
+ from detectron2.data import transforms as T
14
+ from detectron2.data.dataset_mapper import DatasetMapper
15
+ from detectron2.structures import Boxes, BoxMode, Instances
16
+ from detectron2.structures import Keypoints, PolygonMasks, BitMasks
17
+ from fvcore.transforms.transform import TransformList
18
+ from .custom_build_augmentation import build_custom_augmentation
19
+ from .tar_dataset import DiskTarDataset
20
+
21
+ __all__ = ["CustomDatasetMapper"]
22
+
23
+ class CustomDatasetMapper(DatasetMapper):
24
+ @configurable
25
+ def __init__(self, is_train: bool,
26
+ with_ann_type=False,
27
+ dataset_ann=[],
28
+ use_diff_bs_size=False,
29
+ dataset_augs=[],
30
+ is_debug=False,
31
+ use_tar_dataset=False,
32
+ tarfile_path='',
33
+ tar_index_dir='',
34
+ **kwargs):
35
+ """
36
+ add image labels
37
+ """
38
+ self.with_ann_type = with_ann_type
39
+ self.dataset_ann = dataset_ann
40
+ self.use_diff_bs_size = use_diff_bs_size
41
+ if self.use_diff_bs_size and is_train:
42
+ self.dataset_augs = [T.AugmentationList(x) for x in dataset_augs]
43
+ self.is_debug = is_debug
44
+ self.use_tar_dataset = use_tar_dataset
45
+ if self.use_tar_dataset:
46
+ print('Using tar dataset')
47
+ self.tar_dataset = DiskTarDataset(tarfile_path, tar_index_dir)
48
+ super().__init__(is_train, **kwargs)
49
+
50
+
51
+ @classmethod
52
+ def from_config(cls, cfg, is_train: bool = True):
53
+ ret = super().from_config(cfg, is_train)
54
+ ret.update({
55
+ 'with_ann_type': cfg.WITH_IMAGE_LABELS,
56
+ 'dataset_ann': cfg.DATALOADER.DATASET_ANN,
57
+ 'use_diff_bs_size': cfg.DATALOADER.USE_DIFF_BS_SIZE,
58
+ 'is_debug': cfg.IS_DEBUG,
59
+ 'use_tar_dataset': cfg.DATALOADER.USE_TAR_DATASET,
60
+ 'tarfile_path': cfg.DATALOADER.TARFILE_PATH,
61
+ 'tar_index_dir': cfg.DATALOADER.TAR_INDEX_DIR,
62
+ })
63
+ if ret['use_diff_bs_size'] and is_train:
64
+ if cfg.INPUT.CUSTOM_AUG == 'EfficientDetResizeCrop':
65
+ dataset_scales = cfg.DATALOADER.DATASET_INPUT_SCALE
66
+ dataset_sizes = cfg.DATALOADER.DATASET_INPUT_SIZE
67
+ ret['dataset_augs'] = [
68
+ build_custom_augmentation(cfg, True, scale, size) \
69
+ for scale, size in zip(dataset_scales, dataset_sizes)]
70
+ else:
71
+ assert cfg.INPUT.CUSTOM_AUG == 'ResizeShortestEdge'
72
+ min_sizes = cfg.DATALOADER.DATASET_MIN_SIZES
73
+ max_sizes = cfg.DATALOADER.DATASET_MAX_SIZES
74
+ ret['dataset_augs'] = [
75
+ build_custom_augmentation(
76
+ cfg, True, min_size=mi, max_size=ma) \
77
+ for mi, ma in zip(min_sizes, max_sizes)]
78
+ else:
79
+ ret['dataset_augs'] = []
80
+
81
+ return ret
82
+
83
+ def __call__(self, dataset_dict):
84
+ """
85
+ include image labels
86
+ """
87
+ dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below
88
+ # USER: Write your own image loading if it's not from a file
89
+ if 'file_name' in dataset_dict:
90
+ ori_image = utils.read_image(
91
+ dataset_dict["file_name"], format=self.image_format)
92
+ else:
93
+ ori_image, _, _ = self.tar_dataset[dataset_dict["tar_index"]]
94
+ ori_image = utils._apply_exif_orientation(ori_image)
95
+ ori_image = utils.convert_PIL_to_numpy(ori_image, self.image_format)
96
+ utils.check_image_size(dataset_dict, ori_image)
97
+
98
+ # USER: Remove if you don't do semantic/panoptic segmentation.
99
+ if "sem_seg_file_name" in dataset_dict:
100
+ sem_seg_gt = utils.read_image(
101
+ dataset_dict.pop("sem_seg_file_name"), "L").squeeze(2)
102
+ else:
103
+ sem_seg_gt = None
104
+
105
+ if self.is_debug:
106
+ dataset_dict['dataset_source'] = 0
107
+
108
+ not_full_labeled = 'dataset_source' in dataset_dict and \
109
+ self.with_ann_type and \
110
+ self.dataset_ann[dataset_dict['dataset_source']] != 'box'
111
+
112
+ aug_input = T.AugInput(copy.deepcopy(ori_image), sem_seg=sem_seg_gt)
113
+ if self.use_diff_bs_size and self.is_train:
114
+ transforms = \
115
+ self.dataset_augs[dataset_dict['dataset_source']](aug_input)
116
+ else:
117
+ transforms = self.augmentations(aug_input)
118
+ image, sem_seg_gt = aug_input.image, aug_input.sem_seg
119
+
120
+ image_shape = image.shape[:2] # h, w
121
+ dataset_dict["image"] = torch.as_tensor(
122
+ np.ascontiguousarray(image.transpose(2, 0, 1)))
123
+
124
+ if sem_seg_gt is not None:
125
+ dataset_dict["sem_seg"] = torch.as_tensor(sem_seg_gt.astype("long"))
126
+
127
+ # USER: Remove if you don't use pre-computed proposals.
128
+ # Most users would not need this feature.
129
+ if self.proposal_topk is not None:
130
+ utils.transform_proposals(
131
+ dataset_dict, image_shape, transforms,
132
+ proposal_topk=self.proposal_topk
133
+ )
134
+
135
+ if not self.is_train:
136
+ # USER: Modify this if you want to keep them for some reason.
137
+ dataset_dict.pop("annotations", None)
138
+ dataset_dict.pop("sem_seg_file_name", None)
139
+ return dataset_dict
140
+
141
+ if "annotations" in dataset_dict:
142
+ # USER: Modify this if you want to keep them for some reason.
143
+ for anno in dataset_dict["annotations"]:
144
+ if not self.use_instance_mask:
145
+ anno.pop("segmentation", None)
146
+ if not self.use_keypoint:
147
+ anno.pop("keypoints", None)
148
+
149
+ # USER: Implement additional transformations if you have other types of data
150
+ all_annos = [
151
+ (utils.transform_instance_annotations(
152
+ obj, transforms, image_shape,
153
+ keypoint_hflip_indices=self.keypoint_hflip_indices,
154
+ ), obj.get("iscrowd", 0))
155
+ for obj in dataset_dict.pop("annotations")
156
+ ]
157
+ annos = [ann[0] for ann in all_annos if ann[1] == 0]
158
+ instances = utils.annotations_to_instances(
159
+ annos, image_shape, mask_format=self.instance_mask_format
160
+ )
161
+
162
+ del all_annos
163
+ if self.recompute_boxes:
164
+ instances.gt_boxes = instances.gt_masks.get_bounding_boxes()
165
+ dataset_dict["instances"] = utils.filter_empty_instances(instances)
166
+ if self.with_ann_type:
167
+ dataset_dict["pos_category_ids"] = dataset_dict.get(
168
+ 'pos_category_ids', [])
169
+ dataset_dict["ann_type"] = \
170
+ self.dataset_ann[dataset_dict['dataset_source']]
171
+ if self.is_debug and (('pos_category_ids' not in dataset_dict) or \
172
+ (dataset_dict['pos_category_ids'] == [])):
173
+ dataset_dict['pos_category_ids'] = [x for x in sorted(set(
174
+ dataset_dict['instances'].gt_classes.tolist()
175
+ ))]
176
+ return dataset_dict
177
+
178
+ # DETR augmentation
179
+ def build_transform_gen(cfg, is_train):
180
+ """Create the default DETR-style transform gens (random flip + resize) from config.
181
+ """
182
+ if is_train:
183
+ min_size = cfg.INPUT.MIN_SIZE_TRAIN
184
+ max_size = cfg.INPUT.MAX_SIZE_TRAIN
185
+ sample_style = cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING
186
+ else:
187
+ min_size = cfg.INPUT.MIN_SIZE_TEST
188
+ max_size = cfg.INPUT.MAX_SIZE_TEST
189
+ sample_style = "choice"
190
+ if sample_style == "range":
191
+ assert len(min_size) == 2, "more than 2 ({}) min_size(s) are provided for ranges".format(len(min_size))
192
+
193
+ logger = logging.getLogger(__name__)
194
+ tfm_gens = []
195
+ if is_train:
196
+ tfm_gens.append(T.RandomFlip())
197
+ tfm_gens.append(T.ResizeShortestEdge(min_size, max_size, sample_style))
198
+ if is_train:
199
+ logger.info("TransformGens used in training: " + str(tfm_gens))
200
+ return tfm_gens
201
+
202
+
203
+ class DetrDatasetMapper:
204
+ """
205
+ A callable which takes a dataset dict in Detectron2 Dataset format,
206
+ and maps it into a format used by DETR.
207
+ The callable currently does the following:
208
+ 1. Reads the image from "file_name"
209
+ 2. Applies geometric transforms to the image and annotation
210
+ 3. Finds and applies suitable cropping to the image and annotation
211
+ 4. Prepares the image and annotation as Tensors
212
+ """
213
+
214
+ def __init__(self, cfg, is_train=True):
215
+ if cfg.INPUT.CROP.ENABLED and is_train:
216
+ self.crop_gen = [
217
+ T.ResizeShortestEdge([400, 500, 600], sample_style="choice"),
218
+ T.RandomCrop(cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE),
219
+ ]
220
+ else:
221
+ self.crop_gen = None
222
+
223
+ self.mask_on = cfg.MODEL.MASK_ON
224
+ self.tfm_gens = build_transform_gen(cfg, is_train)
225
+ logging.getLogger(__name__).info(
226
+ "Full TransformGens used in training: {}, crop: {}".format(str(self.tfm_gens), str(self.crop_gen))
227
+ )
228
+
229
+ self.img_format = cfg.INPUT.FORMAT
230
+ self.is_train = is_train
231
+
232
+ def __call__(self, dataset_dict):
233
+ """
234
+ Args:
235
+ dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
236
+ Returns:
237
+ dict: a format that builtin models in detectron2 accept
238
+ """
239
+ dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below
240
+ image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
241
+ utils.check_image_size(dataset_dict, image)
242
+
243
+ if self.crop_gen is None:
244
+ image, transforms = T.apply_transform_gens(self.tfm_gens, image)
245
+ else:
246
+ if np.random.rand() > 0.5:
247
+ image, transforms = T.apply_transform_gens(self.tfm_gens, image)
248
+ else:
249
+ image, transforms = T.apply_transform_gens(
250
+ self.tfm_gens[:-1] + self.crop_gen + self.tfm_gens[-1:], image
251
+ )
252
+
253
+ image_shape = image.shape[:2] # h, w
254
+
255
+ # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
256
+ # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
257
+ # Therefore it's important to use torch.Tensor.
258
+ dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
259
+
260
+ if not self.is_train:
261
+ # USER: Modify this if you want to keep them for some reason.
262
+ dataset_dict.pop("annotations", None)
263
+ return dataset_dict
264
+
265
+ if "annotations" in dataset_dict:
266
+ # USER: Modify this if you want to keep them for some reason.
267
+ for anno in dataset_dict["annotations"]:
268
+ if not self.mask_on:
269
+ anno.pop("segmentation", None)
270
+ anno.pop("keypoints", None)
271
+
272
+ # USER: Implement additional transformations if you have other types of data
273
+ annos = [
274
+ utils.transform_instance_annotations(obj, transforms, image_shape)
275
+ for obj in dataset_dict.pop("annotations")
276
+ if obj.get("iscrowd", 0) == 0
277
+ ]
278
+ instances = utils.annotations_to_instances(annos, image_shape)
279
+ dataset_dict["instances"] = utils.filter_empty_instances(instances)
280
+ return dataset_dict
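
DetrDatasetMapper above applies the crop generators to a random half of the images by splicing them between the flip and the final resize. A small standalone sketch of that branch on a random image (assumes detectron2 is installed; the crop type and size below are illustrative stand-ins for cfg.INPUT.CROP.TYPE / cfg.INPUT.CROP.SIZE):

    import numpy as np
    from detectron2.data import transforms as T

    tfm_gens = [T.RandomFlip(), T.ResizeShortestEdge([800], 1333, "choice")]
    crop_gen = [
        T.ResizeShortestEdge([400, 500, 600], sample_style="choice"),
        T.RandomCrop("absolute_range", (384, 600)),
    ]

    image = (np.random.rand(480, 640, 3) * 255).astype("uint8")
    if np.random.rand() > 0.5:
        image, _ = T.apply_transform_gens(tfm_gens, image)
    else:
        # resize-then-crop is inserted just before the final ResizeShortestEdge
        image, _ = T.apply_transform_gens(tfm_gens[:-1] + crop_gen + tfm_gens[-1:], image)
    print(image.shape)
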
proxydet/data/datasets/cc.py ADDED
@@ -0,0 +1,23 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ import logging
3
+ import os
4
+
5
+ from detectron2.data.datasets.builtin_meta import _get_builtin_metadata
6
+ from detectron2.data.datasets.lvis import get_lvis_instances_meta
7
+ from .lvis_v1 import custom_register_lvis_instances
8
+
9
+ _CUSTOM_SPLITS = {
10
+ "cc3m_v1_val": ("cc3m/validation/", "cc3m/val_image_info.json"),
11
+ "cc3m_v1_train": ("cc3m/training/", "cc3m/train_image_info.json"),
12
+ "cc3m_v1_train_tags": ("cc3m/training/", "cc3m/train_image_info_tags.json"),
13
+
14
+ }
15
+
16
+ for key, (image_root, json_file) in _CUSTOM_SPLITS.items():
17
+ custom_register_lvis_instances(
18
+ key,
19
+ get_lvis_instances_meta('lvis_v1'),
20
+ os.path.join("datasets", json_file) if "://" not in json_file else json_file,
21
+ os.path.join("datasets", image_root),
22
+ )
23
+
proxydet/data/datasets/coco_zeroshot.py ADDED
@@ -0,0 +1,121 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ import os
3
+
4
+ from detectron2.data.datasets.register_coco import register_coco_instances
5
+ from detectron2.data.datasets.builtin_meta import _get_builtin_metadata
6
+ from .lvis_v1 import custom_register_lvis_instances
7
+
8
+ categories_seen = [
9
+ {'id': 1, 'name': 'person'},
10
+ {'id': 2, 'name': 'bicycle'},
11
+ {'id': 3, 'name': 'car'},
12
+ {'id': 4, 'name': 'motorcycle'},
13
+ {'id': 7, 'name': 'train'},
14
+ {'id': 8, 'name': 'truck'},
15
+ {'id': 9, 'name': 'boat'},
16
+ {'id': 15, 'name': 'bench'},
17
+ {'id': 16, 'name': 'bird'},
18
+ {'id': 19, 'name': 'horse'},
19
+ {'id': 20, 'name': 'sheep'},
20
+ {'id': 23, 'name': 'bear'},
21
+ {'id': 24, 'name': 'zebra'},
22
+ {'id': 25, 'name': 'giraffe'},
23
+ {'id': 27, 'name': 'backpack'},
24
+ {'id': 31, 'name': 'handbag'},
25
+ {'id': 33, 'name': 'suitcase'},
26
+ {'id': 34, 'name': 'frisbee'},
27
+ {'id': 35, 'name': 'skis'},
28
+ {'id': 38, 'name': 'kite'},
29
+ {'id': 42, 'name': 'surfboard'},
30
+ {'id': 44, 'name': 'bottle'},
31
+ {'id': 48, 'name': 'fork'},
32
+ {'id': 50, 'name': 'spoon'},
33
+ {'id': 51, 'name': 'bowl'},
34
+ {'id': 52, 'name': 'banana'},
35
+ {'id': 53, 'name': 'apple'},
36
+ {'id': 54, 'name': 'sandwich'},
37
+ {'id': 55, 'name': 'orange'},
38
+ {'id': 56, 'name': 'broccoli'},
39
+ {'id': 57, 'name': 'carrot'},
40
+ {'id': 59, 'name': 'pizza'},
41
+ {'id': 60, 'name': 'donut'},
42
+ {'id': 62, 'name': 'chair'},
43
+ {'id': 65, 'name': 'bed'},
44
+ {'id': 70, 'name': 'toilet'},
45
+ {'id': 72, 'name': 'tv'},
46
+ {'id': 73, 'name': 'laptop'},
47
+ {'id': 74, 'name': 'mouse'},
48
+ {'id': 75, 'name': 'remote'},
49
+ {'id': 78, 'name': 'microwave'},
50
+ {'id': 79, 'name': 'oven'},
51
+ {'id': 80, 'name': 'toaster'},
52
+ {'id': 82, 'name': 'refrigerator'},
53
+ {'id': 84, 'name': 'book'},
54
+ {'id': 85, 'name': 'clock'},
55
+ {'id': 86, 'name': 'vase'},
56
+ {'id': 90, 'name': 'toothbrush'},
57
+ ]
58
+
59
+ categories_unseen = [
60
+ {'id': 5, 'name': 'airplane'},
61
+ {'id': 6, 'name': 'bus'},
62
+ {'id': 17, 'name': 'cat'},
63
+ {'id': 18, 'name': 'dog'},
64
+ {'id': 21, 'name': 'cow'},
65
+ {'id': 22, 'name': 'elephant'},
66
+ {'id': 28, 'name': 'umbrella'},
67
+ {'id': 32, 'name': 'tie'},
68
+ {'id': 36, 'name': 'snowboard'},
69
+ {'id': 41, 'name': 'skateboard'},
70
+ {'id': 47, 'name': 'cup'},
71
+ {'id': 49, 'name': 'knife'},
72
+ {'id': 61, 'name': 'cake'},
73
+ {'id': 63, 'name': 'couch'},
74
+ {'id': 76, 'name': 'keyboard'},
75
+ {'id': 81, 'name': 'sink'},
76
+ {'id': 87, 'name': 'scissors'},
77
+ ]
78
+
79
+ def _get_metadata(cat):
80
+ if cat == 'all':
81
+ return _get_builtin_metadata('coco')
82
+ elif cat == 'seen':
83
+ id_to_name = {x['id']: x['name'] for x in categories_seen}
84
+ else:
85
+ assert cat == 'unseen'
86
+ id_to_name = {x['id']: x['name'] for x in categories_unseen}
87
+
88
+ thing_dataset_id_to_contiguous_id = {
89
+ x: i for i, x in enumerate(sorted(id_to_name))}
90
+ thing_classes = [id_to_name[k] for k in sorted(id_to_name)]
91
+ return {
92
+ "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id,
93
+ "thing_classes": thing_classes}
94
+
95
+ _PREDEFINED_SPLITS_COCO = {
96
+ "coco_zeroshot_train": ("coco/train2017", "coco/zero-shot/instances_train2017_seen_2.json", 'seen'),
97
+ "coco_zeroshot_val": ("coco/val2017", "coco/zero-shot/instances_val2017_unseen_2.json", 'unseen'),
98
+ "coco_not_zeroshot_val": ("coco/val2017", "coco/zero-shot/instances_val2017_seen_2.json", 'seen'),
99
+ "coco_generalized_zeroshot_val": ("coco/val2017", "coco/zero-shot/instances_val2017_all_2_oriorder.json", 'all'),
100
+ "coco_zeroshot_train_oriorder": ("coco/train2017", "coco/zero-shot/instances_train2017_seen_2_oriorder.json", 'all'),
101
+ }
102
+
103
+ for key, (image_root, json_file, cat) in _PREDEFINED_SPLITS_COCO.items():
104
+ register_coco_instances(
105
+ key,
106
+ _get_metadata(cat),
107
+ os.path.join("datasets", json_file) if "://" not in json_file else json_file,
108
+ os.path.join("datasets", image_root),
109
+ )
110
+
111
+ _CUSTOM_SPLITS_COCO = {
112
+ "cc3m_coco_train_tags": ("cc3m/training/", "cc3m/coco_train_image_info_tags.json"),
113
+ "coco_caption_train_tags": ("coco/train2017/", "coco/annotations/captions_train2017_tags_allcaps.json"),}
114
+
115
+ for key, (image_root, json_file) in _CUSTOM_SPLITS_COCO.items():
116
+ custom_register_lvis_instances(
117
+ key,
118
+ _get_builtin_metadata('coco'),
119
+ os.path.join("datasets", json_file) if "://" not in json_file else json_file,
120
+ os.path.join("datasets", image_root),
121
+ )
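
The seen/unseen splits above share one convention: raw COCO category ids are sorted and remapped to contiguous ids 0..N-1, and thing_classes follows that order. A tiny worked example on the first three unseen categories:

    cats = [{'id': 5, 'name': 'airplane'}, {'id': 6, 'name': 'bus'}, {'id': 17, 'name': 'cat'}]
    id_to_name = {x['id']: x['name'] for x in cats}
    thing_dataset_id_to_contiguous_id = {x: i for i, x in enumerate(sorted(id_to_name))}
    thing_classes = [id_to_name[k] for k in sorted(id_to_name)]
    print(thing_dataset_id_to_contiguous_id)  # {5: 0, 6: 1, 17: 2}
    print(thing_classes)                      # ['airplane', 'bus', 'cat']
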
proxydet/data/datasets/imagenet.py ADDED
@@ -0,0 +1,41 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ import logging
3
+ import os
4
+
5
+ from detectron2.data import DatasetCatalog, MetadataCatalog
6
+ from detectron2.data.datasets.lvis import get_lvis_instances_meta
7
+ from .lvis_v1 import custom_load_lvis_json, get_lvis_22k_meta
8
+ def custom_register_imagenet_instances(name, metadata, json_file, image_root):
9
+ """Register an image-labelled (ImageNet) split stored in LVIS json format, with the "imagenet" evaluator type.
10
+ """
11
+ DatasetCatalog.register(name, lambda: custom_load_lvis_json(
12
+ json_file, image_root, name))
13
+ MetadataCatalog.get(name).set(
14
+ json_file=json_file, image_root=image_root,
15
+ evaluator_type="imagenet", **metadata
16
+ )
17
+
18
+ _CUSTOM_SPLITS_IMAGENET = {
19
+ "imagenet_lvis_v1": ("imagenet/ImageNet-LVIS/", "imagenet/annotations/imagenet_lvis_image_info.json"),
20
+ }
21
+
22
+ for key, (image_root, json_file) in _CUSTOM_SPLITS_IMAGENET.items():
23
+ custom_register_imagenet_instances(
24
+ key,
25
+ get_lvis_instances_meta('lvis_v1'),
26
+ os.path.join("datasets", json_file) if "://" not in json_file else json_file,
27
+ os.path.join("datasets", image_root),
28
+ )
29
+
30
+
31
+ _CUSTOM_SPLITS_IMAGENET_22K = {
32
+ "imagenet_lvis-22k": ("imagenet/ImageNet-LVIS/", "imagenet/annotations/imagenet-22k_image_info_lvis-22k.json"),
33
+ }
34
+
35
+ for key, (image_root, json_file) in _CUSTOM_SPLITS_IMAGENET_22K.items():
36
+ custom_register_imagenet_instances(
37
+ key,
38
+ get_lvis_22k_meta(),
39
+ os.path.join("datasets", json_file) if "://" not in json_file else json_file,
40
+ os.path.join("datasets", image_root),
41
+ )
proxydet/data/datasets/lvis_22k_categories.py ADDED
The diff for this file is too large to render. See raw diff
 
proxydet/data/datasets/lvis_v1.py ADDED
@@ -0,0 +1,155 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ import logging
3
+ import os
4
+
5
+ from fvcore.common.timer import Timer
6
+ from detectron2.structures import BoxMode
7
+ from fvcore.common.file_io import PathManager
8
+ from detectron2.data import DatasetCatalog, MetadataCatalog
9
+ from detectron2.data.datasets.lvis import get_lvis_instances_meta
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+ __all__ = ["custom_load_lvis_json", "custom_register_lvis_instances"]
14
+
15
+
16
+ def custom_register_lvis_instances(name, metadata, json_file, image_root):
17
+ """Register an LVIS-format json so DatasetCatalog/MetadataCatalog can load it lazily.
18
+ """
19
+ DatasetCatalog.register(name, lambda: custom_load_lvis_json(
20
+ json_file, image_root, name))
21
+ MetadataCatalog.get(name).set(
22
+ json_file=json_file, image_root=image_root,
23
+ evaluator_type="lvis", **metadata
24
+ )
25
+
26
+
27
+ def custom_load_lvis_json(json_file, image_root, dataset_name=None):
28
+ '''
29
+ Modifications:
30
+ use `file_name`
31
+ convert neg_category_ids
32
+ add pos_category_ids
33
+ '''
34
+ from lvis import LVIS
35
+
36
+ json_file = PathManager.get_local_path(json_file)
37
+
38
+ timer = Timer()
39
+ lvis_api = LVIS(json_file)
40
+ if timer.seconds() > 1:
41
+ logger.info("Loading {} takes {:.2f} seconds.".format(
42
+ json_file, timer.seconds()))
43
+
44
+ catid2contid = {x['id']: i for i, x in enumerate(
45
+ sorted(lvis_api.dataset['categories'], key=lambda x: x['id']))}
46
+ if len(lvis_api.dataset['categories']) == 1203:
47
+ for x in lvis_api.dataset['categories']:
48
+ assert catid2contid[x['id']] == x['id'] - 1
49
+ img_ids = sorted(lvis_api.imgs.keys())
50
+ imgs = lvis_api.load_imgs(img_ids)
51
+ anns = [lvis_api.img_ann_map[img_id] for img_id in img_ids]
52
+
53
+ ann_ids = [ann["id"] for anns_per_image in anns for ann in anns_per_image]
54
+ assert len(set(ann_ids)) == len(ann_ids), \
55
+ "Annotation ids in '{}' are not unique".format(json_file)
56
+
57
+ imgs_anns = list(zip(imgs, anns))
58
+ logger.info("Loaded {} images in the LVIS v1 format from {}".format(
59
+ len(imgs_anns), json_file))
60
+
61
+ dataset_dicts = []
62
+
63
+ for (img_dict, anno_dict_list) in imgs_anns:
64
+ record = {}
65
+ if "file_name" in img_dict:
66
+ file_name = img_dict["file_name"]
67
+ if img_dict["file_name"].startswith("COCO"):
68
+ file_name = file_name[-16:]
69
+ record["file_name"] = os.path.join(image_root, file_name)
70
+ elif 'coco_url' in img_dict:
71
+ # e.g., http://images.cocodataset.org/train2017/000000391895.jpg
72
+ file_name = img_dict["coco_url"][30:]
73
+ record["file_name"] = os.path.join(image_root, file_name)
74
+ elif 'tar_index' in img_dict:
75
+ record['tar_index'] = img_dict['tar_index']
76
+
77
+ record["height"] = img_dict["height"]
78
+ record["width"] = img_dict["width"]
79
+ record["not_exhaustive_category_ids"] = img_dict.get(
80
+ "not_exhaustive_category_ids", [])
81
+ record["neg_category_ids"] = img_dict.get("neg_category_ids", [])
82
+ # NOTE: modified by Xingyi: convert to 0-based
83
+ record["neg_category_ids"] = [
84
+ catid2contid[x] for x in record["neg_category_ids"]]
85
+ if 'pos_category_ids' in img_dict:
86
+ record['pos_category_ids'] = [
87
+ catid2contid[x] for x in img_dict.get("pos_category_ids", [])]
88
+ if 'captions' in img_dict:
89
+ record['captions'] = img_dict['captions']
90
+ if 'caption_features' in img_dict:
91
+ record['caption_features'] = img_dict['caption_features']
92
+ image_id = record["image_id"] = img_dict["id"]
93
+
94
+ objs = []
95
+ for anno in anno_dict_list:
96
+ assert anno["image_id"] == image_id
97
+ if anno.get('iscrowd', 0) > 0:
98
+ continue
99
+ obj = {"bbox": anno["bbox"], "bbox_mode": BoxMode.XYWH_ABS}
100
+ obj["category_id"] = catid2contid[anno['category_id']]
101
+ if 'segmentation' in anno:
102
+ segm = anno["segmentation"]
103
+ valid_segm = [poly for poly in segm \
104
+ if len(poly) % 2 == 0 and len(poly) >= 6]
105
+ # assert len(segm) == len(
106
+ # valid_segm
107
+ # ), "Annotation contains an invalid polygon with < 3 points"
108
+ if not len(segm) == len(valid_segm):
109
+ print('Annotation contains an invalid polygon with < 3 points')
110
+ assert len(segm) > 0
111
+ obj["segmentation"] = segm
112
+ objs.append(obj)
113
+ record["annotations"] = objs
114
+ dataset_dicts.append(record)
115
+
116
+ return dataset_dicts
117
+
118
+ _CUSTOM_SPLITS_LVIS = {
119
+ "lvis_v1_train+coco": ("coco/", "lvis/lvis_v1_train+coco_mask.json"),
120
+ "lvis_v1_train_norare": ("coco/", "lvis/lvis_v1_train_norare.json"),
121
+ }
122
+
123
+
124
+ for key, (image_root, json_file) in _CUSTOM_SPLITS_LVIS.items():
125
+ custom_register_lvis_instances(
126
+ key,
127
+ get_lvis_instances_meta(key),
128
+ os.path.join("datasets", json_file) if "://" not in json_file else json_file,
129
+ os.path.join("datasets", image_root),
130
+ )
131
+
132
+
133
+ def get_lvis_22k_meta():
134
+ from .lvis_22k_categories import CATEGORIES
135
+ cat_ids = [k["id"] for k in CATEGORIES]
136
+ assert min(cat_ids) == 1 and max(cat_ids) == len(
137
+ cat_ids
138
+ ), "Category ids are not in [1, #categories], as expected"
139
+ # Ensure that the category list is sorted by id
140
+ lvis_categories = sorted(CATEGORIES, key=lambda x: x["id"])
141
+ thing_classes = [k["name"] for k in lvis_categories]
142
+ meta = {"thing_classes": thing_classes}
143
+ return meta
144
+
145
+ _CUSTOM_SPLITS_LVIS_22K = {
146
+ "lvis_v1_train_22k": ("coco/", "lvis/lvis_v1_train_lvis-22k.json"),
147
+ }
148
+
149
+ for key, (image_root, json_file) in _CUSTOM_SPLITS_LVIS_22K.items():
150
+ custom_register_lvis_instances(
151
+ key,
152
+ get_lvis_22k_meta(),
153
+ os.path.join("datasets", json_file) if "://" not in json_file else json_file,
154
+ os.path.join("datasets", image_root),
155
+ )
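
Registration above is lazy: only a loader function and metadata are recorded, and nothing is read from disk until a split is requested. A short sketch of looking one of these splits up (assumes detectron2 and the proxydet package from this commit import cleanly; DatasetCatalog.get would additionally need the json and images under datasets/):

    import proxydet.data.datasets.lvis_v1  # noqa: F401  (runs the registration loops above)
    from detectron2.data import DatasetCatalog, MetadataCatalog

    name = "lvis_v1_train_norare"
    meta = MetadataCatalog.get(name)
    print(meta.json_file, meta.image_root, meta.evaluator_type)
    # dicts = DatasetCatalog.get(name)  # calls custom_load_lvis_json; needs the data on disk
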
proxydet/data/datasets/objects365.py ADDED
@@ -0,0 +1,770 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ from detectron2.data.datasets.register_coco import register_coco_instances
3
+ import os
4
+
5
+ # categories_v2 = [
6
+ # {'id': 1, 'name': 'Person'},
7
+ # {'id': 2, 'name': 'Sneakers'},
8
+ # {'id': 3, 'name': 'Chair'},
9
+ # {'id': 4, 'name': 'Other Shoes'},
10
+ # {'id': 5, 'name': 'Hat'},
11
+ # {'id': 6, 'name': 'Car'},
12
+ # {'id': 7, 'name': 'Lamp'},
13
+ # {'id': 8, 'name': 'Glasses'},
14
+ # {'id': 9, 'name': 'Bottle'},
15
+ # {'id': 10, 'name': 'Desk'},
16
+ # {'id': 11, 'name': 'Cup'},
17
+ # {'id': 12, 'name': 'Street Lights'},
18
+ # {'id': 13, 'name': 'Cabinet/shelf'},
19
+ # {'id': 14, 'name': 'Handbag/Satchel'},
20
+ # {'id': 15, 'name': 'Bracelet'},
21
+ # {'id': 16, 'name': 'Plate'},
22
+ # {'id': 17, 'name': 'Picture/Frame'},
23
+ # {'id': 18, 'name': 'Helmet'},
24
+ # {'id': 19, 'name': 'Book'},
25
+ # {'id': 20, 'name': 'Gloves'},
26
+ # {'id': 21, 'name': 'Storage box'},
27
+ # {'id': 22, 'name': 'Boat'},
28
+ # {'id': 23, 'name': 'Leather Shoes'},
29
+ # {'id': 24, 'name': 'Flower'},
30
+ # {'id': 25, 'name': 'Bench'},
31
+ # {'id': 26, 'name': 'Potted Plant'},
32
+ # {'id': 27, 'name': 'Bowl/Basin'},
33
+ # {'id': 28, 'name': 'Flag'},
34
+ # {'id': 29, 'name': 'Pillow'},
35
+ # {'id': 30, 'name': 'Boots'},
36
+ # {'id': 31, 'name': 'Vase'},
37
+ # {'id': 32, 'name': 'Microphone'},
38
+ # {'id': 33, 'name': 'Necklace'},
39
+ # {'id': 34, 'name': 'Ring'},
40
+ # {'id': 35, 'name': 'SUV'},
41
+ # {'id': 36, 'name': 'Wine Glass'},
42
+ # {'id': 37, 'name': 'Belt'},
43
+ # {'id': 38, 'name': 'Moniter/TV'},
44
+ # {'id': 39, 'name': 'Backpack'},
45
+ # {'id': 40, 'name': 'Umbrella'},
46
+ # {'id': 41, 'name': 'Traffic Light'},
47
+ # {'id': 42, 'name': 'Speaker'},
48
+ # {'id': 43, 'name': 'Watch'},
49
+ # {'id': 44, 'name': 'Tie'},
50
+ # {'id': 45, 'name': 'Trash bin Can'},
51
+ # {'id': 46, 'name': 'Slippers'},
52
+ # {'id': 47, 'name': 'Bicycle'},
53
+ # {'id': 48, 'name': 'Stool'},
54
+ # {'id': 49, 'name': 'Barrel/bucket'},
55
+ # {'id': 50, 'name': 'Van'},
56
+ # {'id': 51, 'name': 'Couch'},
57
+ # {'id': 52, 'name': 'Sandals'},
58
+ # {'id': 53, 'name': 'Bakset'},
59
+ # {'id': 54, 'name': 'Drum'},
60
+ # {'id': 55, 'name': 'Pen/Pencil'},
61
+ # {'id': 56, 'name': 'Bus'},
62
+ # {'id': 57, 'name': 'Wild Bird'},
63
+ # {'id': 58, 'name': 'High Heels'},
64
+ # {'id': 59, 'name': 'Motorcycle'},
65
+ # {'id': 60, 'name': 'Guitar'},
66
+ # {'id': 61, 'name': 'Carpet'},
67
+ # {'id': 62, 'name': 'Cell Phone'},
68
+ # {'id': 63, 'name': 'Bread'},
69
+ # {'id': 64, 'name': 'Camera'},
70
+ # {'id': 65, 'name': 'Canned'},
71
+ # {'id': 66, 'name': 'Truck'},
72
+ # {'id': 67, 'name': 'Traffic cone'},
73
+ # {'id': 68, 'name': 'Cymbal'},
74
+ # {'id': 69, 'name': 'Lifesaver'},
75
+ # {'id': 70, 'name': 'Towel'},
76
+ # {'id': 71, 'name': 'Stuffed Toy'},
77
+ # {'id': 72, 'name': 'Candle'},
78
+ # {'id': 73, 'name': 'Sailboat'},
79
+ # {'id': 74, 'name': 'Laptop'},
80
+ # {'id': 75, 'name': 'Awning'},
81
+ # {'id': 76, 'name': 'Bed'},
82
+ # {'id': 77, 'name': 'Faucet'},
83
+ # {'id': 78, 'name': 'Tent'},
84
+ # {'id': 79, 'name': 'Horse'},
85
+ # {'id': 80, 'name': 'Mirror'},
86
+ # {'id': 81, 'name': 'Power outlet'},
87
+ # {'id': 82, 'name': 'Sink'},
88
+ # {'id': 83, 'name': 'Apple'},
89
+ # {'id': 84, 'name': 'Air Conditioner'},
90
+ # {'id': 85, 'name': 'Knife'},
91
+ # {'id': 86, 'name': 'Hockey Stick'},
92
+ # {'id': 87, 'name': 'Paddle'},
93
+ # {'id': 88, 'name': 'Pickup Truck'},
94
+ # {'id': 89, 'name': 'Fork'},
95
+ # {'id': 90, 'name': 'Traffic Sign'},
96
+ # {'id': 91, 'name': 'Ballon'},
97
+ # {'id': 92, 'name': 'Tripod'},
98
+ # {'id': 93, 'name': 'Dog'},
99
+ # {'id': 94, 'name': 'Spoon'},
100
+ # {'id': 95, 'name': 'Clock'},
101
+ # {'id': 96, 'name': 'Pot'},
102
+ # {'id': 97, 'name': 'Cow'},
103
+ # {'id': 98, 'name': 'Cake'},
104
+ # {'id': 99, 'name': 'Dinning Table'},
105
+ # {'id': 100, 'name': 'Sheep'},
106
+ # {'id': 101, 'name': 'Hanger'},
107
+ # {'id': 102, 'name': 'Blackboard/Whiteboard'},
108
+ # {'id': 103, 'name': 'Napkin'},
109
+ # {'id': 104, 'name': 'Other Fish'},
110
+ # {'id': 105, 'name': 'Orange/Tangerine'},
111
+ # {'id': 106, 'name': 'Toiletry'},
112
+ # {'id': 107, 'name': 'Keyboard'},
113
+ # {'id': 108, 'name': 'Tomato'},
114
+ # {'id': 109, 'name': 'Lantern'},
115
+ # {'id': 110, 'name': 'Machinery Vehicle'},
116
+ # {'id': 111, 'name': 'Fan'},
117
+ # {'id': 112, 'name': 'Green Vegetables'},
118
+ # {'id': 113, 'name': 'Banana'},
119
+ # {'id': 114, 'name': 'Baseball Glove'},
120
+ # {'id': 115, 'name': 'Airplane'},
121
+ # {'id': 116, 'name': 'Mouse'},
122
+ # {'id': 117, 'name': 'Train'},
123
+ # {'id': 118, 'name': 'Pumpkin'},
124
+ # {'id': 119, 'name': 'Soccer'},
125
+ # {'id': 120, 'name': 'Skiboard'},
126
+ # {'id': 121, 'name': 'Luggage'},
127
+ # {'id': 122, 'name': 'Nightstand'},
128
+ # {'id': 123, 'name': 'Tea pot'},
129
+ # {'id': 124, 'name': 'Telephone'},
130
+ # {'id': 125, 'name': 'Trolley'},
131
+ # {'id': 126, 'name': 'Head Phone'},
132
+ # {'id': 127, 'name': 'Sports Car'},
133
+ # {'id': 128, 'name': 'Stop Sign'},
134
+ # {'id': 129, 'name': 'Dessert'},
135
+ # {'id': 130, 'name': 'Scooter'},
136
+ # {'id': 131, 'name': 'Stroller'},
137
+ # {'id': 132, 'name': 'Crane'},
138
+ # {'id': 133, 'name': 'Remote'},
139
+ # {'id': 134, 'name': 'Refrigerator'},
140
+ # {'id': 135, 'name': 'Oven'},
141
+ # {'id': 136, 'name': 'Lemon'},
142
+ # {'id': 137, 'name': 'Duck'},
143
+ # {'id': 138, 'name': 'Baseball Bat'},
144
+ # {'id': 139, 'name': 'Surveillance Camera'},
145
+ # {'id': 140, 'name': 'Cat'},
146
+ # {'id': 141, 'name': 'Jug'},
147
+ # {'id': 142, 'name': 'Broccoli'},
148
+ # {'id': 143, 'name': 'Piano'},
149
+ # {'id': 144, 'name': 'Pizza'},
150
+ # {'id': 145, 'name': 'Elephant'},
151
+ # {'id': 146, 'name': 'Skateboard'},
152
+ # {'id': 147, 'name': 'Surfboard'},
153
+ # {'id': 148, 'name': 'Gun'},
154
+ # {'id': 149, 'name': 'Skating and Skiing shoes'},
155
+ # {'id': 150, 'name': 'Gas stove'},
156
+ # {'id': 151, 'name': 'Donut'},
157
+ # {'id': 152, 'name': 'Bow Tie'},
158
+ # {'id': 153, 'name': 'Carrot'},
159
+ # {'id': 154, 'name': 'Toilet'},
160
+ # {'id': 155, 'name': 'Kite'},
161
+ # {'id': 156, 'name': 'Strawberry'},
162
+ # {'id': 157, 'name': 'Other Balls'},
163
+ # {'id': 158, 'name': 'Shovel'},
164
+ # {'id': 159, 'name': 'Pepper'},
165
+ # {'id': 160, 'name': 'Computer Box'},
166
+ # {'id': 161, 'name': 'Toilet Paper'},
167
+ # {'id': 162, 'name': 'Cleaning Products'},
168
+ # {'id': 163, 'name': 'Chopsticks'},
169
+ # {'id': 164, 'name': 'Microwave'},
170
+ # {'id': 165, 'name': 'Pigeon'},
171
+ # {'id': 166, 'name': 'Baseball'},
172
+ # {'id': 167, 'name': 'Cutting/chopping Board'},
173
+ # {'id': 168, 'name': 'Coffee Table'},
174
+ # {'id': 169, 'name': 'Side Table'},
175
+ # {'id': 170, 'name': 'Scissors'},
176
+ # {'id': 171, 'name': 'Marker'},
177
+ # {'id': 172, 'name': 'Pie'},
178
+ # {'id': 173, 'name': 'Ladder'},
179
+ # {'id': 174, 'name': 'Snowboard'},
180
+ # {'id': 175, 'name': 'Cookies'},
181
+ # {'id': 176, 'name': 'Radiator'},
182
+ # {'id': 177, 'name': 'Fire Hydrant'},
183
+ # {'id': 178, 'name': 'Basketball'},
184
+ # {'id': 179, 'name': 'Zebra'},
185
+ # {'id': 180, 'name': 'Grape'},
186
+ # {'id': 181, 'name': 'Giraffe'},
187
+ # {'id': 182, 'name': 'Potato'},
188
+ # {'id': 183, 'name': 'Sausage'},
189
+ # {'id': 184, 'name': 'Tricycle'},
190
+ # {'id': 185, 'name': 'Violin'},
191
+ # {'id': 186, 'name': 'Egg'},
192
+ # {'id': 187, 'name': 'Fire Extinguisher'},
193
+ # {'id': 188, 'name': 'Candy'},
194
+ # {'id': 189, 'name': 'Fire Truck'},
195
+ # {'id': 190, 'name': 'Billards'},
196
+ # {'id': 191, 'name': 'Converter'},
197
+ # {'id': 192, 'name': 'Bathtub'},
198
+ # {'id': 193, 'name': 'Wheelchair'},
199
+ # {'id': 194, 'name': 'Golf Club'},
200
+ # {'id': 195, 'name': 'Briefcase'},
201
+ # {'id': 196, 'name': 'Cucumber'},
202
+ # {'id': 197, 'name': 'Cigar/Cigarette '},
203
+ # {'id': 198, 'name': 'Paint Brush'},
204
+ # {'id': 199, 'name': 'Pear'},
205
+ # {'id': 200, 'name': 'Heavy Truck'},
206
+ # {'id': 201, 'name': 'Hamburger'},
207
+ # {'id': 202, 'name': 'Extractor'},
208
+ # {'id': 203, 'name': 'Extention Cord'},
209
+ # {'id': 204, 'name': 'Tong'},
210
+ # {'id': 205, 'name': 'Tennis Racket'},
211
+ # {'id': 206, 'name': 'Folder'},
212
+ # {'id': 207, 'name': 'American Football'},
213
+ # {'id': 208, 'name': 'earphone'},
214
+ # {'id': 209, 'name': 'Mask'},
215
+ # {'id': 210, 'name': 'Kettle'},
216
+ # {'id': 211, 'name': 'Tennis'},
217
+ # {'id': 212, 'name': 'Ship'},
218
+ # {'id': 213, 'name': 'Swing'},
219
+ # {'id': 214, 'name': 'Coffee Machine'},
220
+ # {'id': 215, 'name': 'Slide'},
221
+ # {'id': 216, 'name': 'Carriage'},
222
+ # {'id': 217, 'name': 'Onion'},
223
+ # {'id': 218, 'name': 'Green beans'},
224
+ # {'id': 219, 'name': 'Projector'},
225
+ # {'id': 220, 'name': 'Frisbee'},
226
+ # {'id': 221, 'name': 'Washing Machine/Drying Machine'},
227
+ # {'id': 222, 'name': 'Chicken'},
228
+ # {'id': 223, 'name': 'Printer'},
229
+ # {'id': 224, 'name': 'Watermelon'},
230
+ # {'id': 225, 'name': 'Saxophone'},
231
+ # {'id': 226, 'name': 'Tissue'},
232
+ # {'id': 227, 'name': 'Toothbrush'},
233
+ # {'id': 228, 'name': 'Ice cream'},
234
+ # {'id': 229, 'name': 'Hotair ballon'},
235
+ # {'id': 230, 'name': 'Cello'},
236
+ # {'id': 231, 'name': 'French Fries'},
237
+ # {'id': 232, 'name': 'Scale'},
238
+ # {'id': 233, 'name': 'Trophy'},
239
+ # {'id': 234, 'name': 'Cabbage'},
240
+ # {'id': 235, 'name': 'Hot dog'},
241
+ # {'id': 236, 'name': 'Blender'},
242
+ # {'id': 237, 'name': 'Peach'},
243
+ # {'id': 238, 'name': 'Rice'},
244
+ # {'id': 239, 'name': 'Wallet/Purse'},
245
+ # {'id': 240, 'name': 'Volleyball'},
246
+ # {'id': 241, 'name': 'Deer'},
247
+ # {'id': 242, 'name': 'Goose'},
248
+ # {'id': 243, 'name': 'Tape'},
249
+ # {'id': 244, 'name': 'Tablet'},
250
+ # {'id': 245, 'name': 'Cosmetics'},
251
+ # {'id': 246, 'name': 'Trumpet'},
252
+ # {'id': 247, 'name': 'Pineapple'},
253
+ # {'id': 248, 'name': 'Golf Ball'},
254
+ # {'id': 249, 'name': 'Ambulance'},
255
+ # {'id': 250, 'name': 'Parking meter'},
256
+ # {'id': 251, 'name': 'Mango'},
257
+ # {'id': 252, 'name': 'Key'},
258
+ # {'id': 253, 'name': 'Hurdle'},
259
+ # {'id': 254, 'name': 'Fishing Rod'},
260
+ # {'id': 255, 'name': 'Medal'},
261
+ # {'id': 256, 'name': 'Flute'},
262
+ # {'id': 257, 'name': 'Brush'},
263
+ # {'id': 258, 'name': 'Penguin'},
264
+ # {'id': 259, 'name': 'Megaphone'},
265
+ # {'id': 260, 'name': 'Corn'},
266
+ # {'id': 261, 'name': 'Lettuce'},
267
+ # {'id': 262, 'name': 'Garlic'},
268
+ # {'id': 263, 'name': 'Swan'},
269
+ # {'id': 264, 'name': 'Helicopter'},
270
+ # {'id': 265, 'name': 'Green Onion'},
271
+ # {'id': 266, 'name': 'Sandwich'},
272
+ # {'id': 267, 'name': 'Nuts'},
273
+ # {'id': 268, 'name': 'Speed Limit Sign'},
274
+ # {'id': 269, 'name': 'Induction Cooker'},
275
+ # {'id': 270, 'name': 'Broom'},
276
+ # {'id': 271, 'name': 'Trombone'},
277
+ # {'id': 272, 'name': 'Plum'},
278
+ # {'id': 273, 'name': 'Rickshaw'},
279
+ # {'id': 274, 'name': 'Goldfish'},
280
+ # {'id': 275, 'name': 'Kiwi fruit'},
281
+ # {'id': 276, 'name': 'Router/modem'},
282
+ # {'id': 277, 'name': 'Poker Card'},
283
+ # {'id': 278, 'name': 'Toaster'},
284
+ # {'id': 279, 'name': 'Shrimp'},
285
+ # {'id': 280, 'name': 'Sushi'},
286
+ # {'id': 281, 'name': 'Cheese'},
287
+ # {'id': 282, 'name': 'Notepaper'},
288
+ # {'id': 283, 'name': 'Cherry'},
289
+ # {'id': 284, 'name': 'Pliers'},
290
+ # {'id': 285, 'name': 'CD'},
291
+ # {'id': 286, 'name': 'Pasta'},
292
+ # {'id': 287, 'name': 'Hammer'},
293
+ # {'id': 288, 'name': 'Cue'},
294
+ # {'id': 289, 'name': 'Avocado'},
295
+ # {'id': 290, 'name': 'Hamimelon'},
296
+ # {'id': 291, 'name': 'Flask'},
297
+ # {'id': 292, 'name': 'Mushroon'},
298
+ # {'id': 293, 'name': 'Screwdriver'},
299
+ # {'id': 294, 'name': 'Soap'},
300
+ # {'id': 295, 'name': 'Recorder'},
301
+ # {'id': 296, 'name': 'Bear'},
302
+ # {'id': 297, 'name': 'Eggplant'},
303
+ # {'id': 298, 'name': 'Board Eraser'},
304
+ # {'id': 299, 'name': 'Coconut'},
305
+ # {'id': 300, 'name': 'Tape Measur/ Ruler'},
306
+ # {'id': 301, 'name': 'Pig'},
307
+ # {'id': 302, 'name': 'Showerhead'},
308
+ # {'id': 303, 'name': 'Globe'},
309
+ # {'id': 304, 'name': 'Chips'},
310
+ # {'id': 305, 'name': 'Steak'},
311
+ # {'id': 306, 'name': 'Crosswalk Sign'},
312
+ # {'id': 307, 'name': 'Stapler'},
313
+ # {'id': 308, 'name': 'Campel'},
314
+ # {'id': 309, 'name': 'Formula 1 '},
315
+ # {'id': 310, 'name': 'Pomegranate'},
316
+ # {'id': 311, 'name': 'Dishwasher'},
317
+ # {'id': 312, 'name': 'Crab'},
318
+ # {'id': 313, 'name': 'Hoverboard'},
319
+ # {'id': 314, 'name': 'Meat ball'},
320
+ # {'id': 315, 'name': 'Rice Cooker'},
321
+ # {'id': 316, 'name': 'Tuba'},
322
+ # {'id': 317, 'name': 'Calculator'},
323
+ # {'id': 318, 'name': 'Papaya'},
324
+ # {'id': 319, 'name': 'Antelope'},
325
+ # {'id': 320, 'name': 'Parrot'},
326
+ # {'id': 321, 'name': 'Seal'},
327
+ # {'id': 322, 'name': 'Buttefly'},
328
+ # {'id': 323, 'name': 'Dumbbell'},
329
+ # {'id': 324, 'name': 'Donkey'},
330
+ # {'id': 325, 'name': 'Lion'},
331
+ # {'id': 326, 'name': 'Urinal'},
332
+ # {'id': 327, 'name': 'Dolphin'},
333
+ # {'id': 328, 'name': 'Electric Drill'},
334
+ # {'id': 329, 'name': 'Hair Dryer'},
335
+ # {'id': 330, 'name': 'Egg tart'},
336
+ # {'id': 331, 'name': 'Jellyfish'},
337
+ # {'id': 332, 'name': 'Treadmill'},
338
+ # {'id': 333, 'name': 'Lighter'},
339
+ # {'id': 334, 'name': 'Grapefruit'},
340
+ # {'id': 335, 'name': 'Game board'},
341
+ # {'id': 336, 'name': 'Mop'},
342
+ # {'id': 337, 'name': 'Radish'},
343
+ # {'id': 338, 'name': 'Baozi'},
344
+ # {'id': 339, 'name': 'Target'},
345
+ # {'id': 340, 'name': 'French'},
346
+ # {'id': 341, 'name': 'Spring Rolls'},
347
+ # {'id': 342, 'name': 'Monkey'},
348
+ # {'id': 343, 'name': 'Rabbit'},
349
+ # {'id': 344, 'name': 'Pencil Case'},
350
+ # {'id': 345, 'name': 'Yak'},
351
+ # {'id': 346, 'name': 'Red Cabbage'},
352
+ # {'id': 347, 'name': 'Binoculars'},
353
+ # {'id': 348, 'name': 'Asparagus'},
354
+ # {'id': 349, 'name': 'Barbell'},
355
+ # {'id': 350, 'name': 'Scallop'},
356
+ # {'id': 351, 'name': 'Noddles'},
357
+ # {'id': 352, 'name': 'Comb'},
358
+ # {'id': 353, 'name': 'Dumpling'},
359
+ # {'id': 354, 'name': 'Oyster'},
360
+ # {'id': 355, 'name': 'Table Teniis paddle'},
361
+ # {'id': 356, 'name': 'Cosmetics Brush/Eyeliner Pencil'},
362
+ # {'id': 357, 'name': 'Chainsaw'},
363
+ # {'id': 358, 'name': 'Eraser'},
364
+ # {'id': 359, 'name': 'Lobster'},
365
+ # {'id': 360, 'name': 'Durian'},
366
+ # {'id': 361, 'name': 'Okra'},
367
+ # {'id': 362, 'name': 'Lipstick'},
368
+ # {'id': 363, 'name': 'Cosmetics Mirror'},
369
+ # {'id': 364, 'name': 'Curling'},
370
+ # {'id': 365, 'name': 'Table Tennis '},
371
+ # ]
372
+
373
+ '''
374
+ The official Objects365 category names contain typos.
375
+ Below is a manual fix.
376
+ '''
377
+ categories_v2_fix = [
378
+ {'id': 1, 'name': 'Person'},
379
+ {'id': 2, 'name': 'Sneakers'},
380
+ {'id': 3, 'name': 'Chair'},
381
+ {'id': 4, 'name': 'Other Shoes'},
382
+ {'id': 5, 'name': 'Hat'},
383
+ {'id': 6, 'name': 'Car'},
384
+ {'id': 7, 'name': 'Lamp'},
385
+ {'id': 8, 'name': 'Glasses'},
386
+ {'id': 9, 'name': 'Bottle'},
387
+ {'id': 10, 'name': 'Desk'},
388
+ {'id': 11, 'name': 'Cup'},
389
+ {'id': 12, 'name': 'Street Lights'},
390
+ {'id': 13, 'name': 'Cabinet/shelf'},
391
+ {'id': 14, 'name': 'Handbag/Satchel'},
392
+ {'id': 15, 'name': 'Bracelet'},
393
+ {'id': 16, 'name': 'Plate'},
394
+ {'id': 17, 'name': 'Picture/Frame'},
395
+ {'id': 18, 'name': 'Helmet'},
396
+ {'id': 19, 'name': 'Book'},
397
+ {'id': 20, 'name': 'Gloves'},
398
+ {'id': 21, 'name': 'Storage box'},
399
+ {'id': 22, 'name': 'Boat'},
400
+ {'id': 23, 'name': 'Leather Shoes'},
401
+ {'id': 24, 'name': 'Flower'},
402
+ {'id': 25, 'name': 'Bench'},
403
+ {'id': 26, 'name': 'Potted Plant'},
404
+ {'id': 27, 'name': 'Bowl/Basin'},
405
+ {'id': 28, 'name': 'Flag'},
406
+ {'id': 29, 'name': 'Pillow'},
407
+ {'id': 30, 'name': 'Boots'},
408
+ {'id': 31, 'name': 'Vase'},
409
+ {'id': 32, 'name': 'Microphone'},
410
+ {'id': 33, 'name': 'Necklace'},
411
+ {'id': 34, 'name': 'Ring'},
412
+ {'id': 35, 'name': 'SUV'},
413
+ {'id': 36, 'name': 'Wine Glass'},
414
+ {'id': 37, 'name': 'Belt'},
415
+ {'id': 38, 'name': 'Monitor/TV'},
416
+ {'id': 39, 'name': 'Backpack'},
417
+ {'id': 40, 'name': 'Umbrella'},
418
+ {'id': 41, 'name': 'Traffic Light'},
419
+ {'id': 42, 'name': 'Speaker'},
420
+ {'id': 43, 'name': 'Watch'},
421
+ {'id': 44, 'name': 'Tie'},
422
+ {'id': 45, 'name': 'Trash bin Can'},
423
+ {'id': 46, 'name': 'Slippers'},
424
+ {'id': 47, 'name': 'Bicycle'},
425
+ {'id': 48, 'name': 'Stool'},
426
+ {'id': 49, 'name': 'Barrel/bucket'},
427
+ {'id': 50, 'name': 'Van'},
428
+ {'id': 51, 'name': 'Couch'},
429
+ {'id': 52, 'name': 'Sandals'},
430
+ {'id': 53, 'name': 'Basket'},
431
+ {'id': 54, 'name': 'Drum'},
432
+ {'id': 55, 'name': 'Pen/Pencil'},
433
+ {'id': 56, 'name': 'Bus'},
434
+ {'id': 57, 'name': 'Wild Bird'},
435
+ {'id': 58, 'name': 'High Heels'},
436
+ {'id': 59, 'name': 'Motorcycle'},
437
+ {'id': 60, 'name': 'Guitar'},
438
+ {'id': 61, 'name': 'Carpet'},
439
+ {'id': 62, 'name': 'Cell Phone'},
440
+ {'id': 63, 'name': 'Bread'},
441
+ {'id': 64, 'name': 'Camera'},
442
+ {'id': 65, 'name': 'Canned'},
443
+ {'id': 66, 'name': 'Truck'},
444
+ {'id': 67, 'name': 'Traffic cone'},
445
+ {'id': 68, 'name': 'Cymbal'},
446
+ {'id': 69, 'name': 'Lifesaver'},
447
+ {'id': 70, 'name': 'Towel'},
448
+ {'id': 71, 'name': 'Stuffed Toy'},
449
+ {'id': 72, 'name': 'Candle'},
450
+ {'id': 73, 'name': 'Sailboat'},
451
+ {'id': 74, 'name': 'Laptop'},
452
+ {'id': 75, 'name': 'Awning'},
453
+ {'id': 76, 'name': 'Bed'},
454
+ {'id': 77, 'name': 'Faucet'},
455
+ {'id': 78, 'name': 'Tent'},
456
+ {'id': 79, 'name': 'Horse'},
457
+ {'id': 80, 'name': 'Mirror'},
458
+ {'id': 81, 'name': 'Power outlet'},
459
+ {'id': 82, 'name': 'Sink'},
460
+ {'id': 83, 'name': 'Apple'},
461
+ {'id': 84, 'name': 'Air Conditioner'},
462
+ {'id': 85, 'name': 'Knife'},
463
+ {'id': 86, 'name': 'Hockey Stick'},
464
+ {'id': 87, 'name': 'Paddle'},
465
+ {'id': 88, 'name': 'Pickup Truck'},
466
+ {'id': 89, 'name': 'Fork'},
467
+ {'id': 90, 'name': 'Traffic Sign'},
468
+ {'id': 91, 'name': 'Ballon'},
469
+ {'id': 92, 'name': 'Tripod'},
470
+ {'id': 93, 'name': 'Dog'},
471
+ {'id': 94, 'name': 'Spoon'},
472
+ {'id': 95, 'name': 'Clock'},
473
+ {'id': 96, 'name': 'Pot'},
474
+ {'id': 97, 'name': 'Cow'},
475
+ {'id': 98, 'name': 'Cake'},
476
+ {'id': 99, 'name': 'Dining Table'},
477
+ {'id': 100, 'name': 'Sheep'},
478
+ {'id': 101, 'name': 'Hanger'},
479
+ {'id': 102, 'name': 'Blackboard/Whiteboard'},
480
+ {'id': 103, 'name': 'Napkin'},
481
+ {'id': 104, 'name': 'Other Fish'},
482
+ {'id': 105, 'name': 'Orange/Tangerine'},
483
+ {'id': 106, 'name': 'Toiletry'},
484
+ {'id': 107, 'name': 'Keyboard'},
485
+ {'id': 108, 'name': 'Tomato'},
486
+ {'id': 109, 'name': 'Lantern'},
487
+ {'id': 110, 'name': 'Machinery Vehicle'},
488
+ {'id': 111, 'name': 'Fan'},
489
+ {'id': 112, 'name': 'Green Vegetables'},
490
+ {'id': 113, 'name': 'Banana'},
491
+ {'id': 114, 'name': 'Baseball Glove'},
492
+ {'id': 115, 'name': 'Airplane'},
493
+ {'id': 116, 'name': 'Mouse'},
494
+ {'id': 117, 'name': 'Train'},
495
+ {'id': 118, 'name': 'Pumpkin'},
496
+ {'id': 119, 'name': 'Soccer'},
497
+ {'id': 120, 'name': 'Skiboard'},
498
+ {'id': 121, 'name': 'Luggage'},
499
+ {'id': 122, 'name': 'Nightstand'},
500
+ {'id': 123, 'name': 'Teapot'},
501
+ {'id': 124, 'name': 'Telephone'},
502
+ {'id': 125, 'name': 'Trolley'},
503
+ {'id': 126, 'name': 'Head Phone'},
504
+ {'id': 127, 'name': 'Sports Car'},
505
+ {'id': 128, 'name': 'Stop Sign'},
506
+ {'id': 129, 'name': 'Dessert'},
507
+ {'id': 130, 'name': 'Scooter'},
508
+ {'id': 131, 'name': 'Stroller'},
509
+ {'id': 132, 'name': 'Crane'},
510
+ {'id': 133, 'name': 'Remote'},
511
+ {'id': 134, 'name': 'Refrigerator'},
512
+ {'id': 135, 'name': 'Oven'},
513
+ {'id': 136, 'name': 'Lemon'},
514
+ {'id': 137, 'name': 'Duck'},
515
+ {'id': 138, 'name': 'Baseball Bat'},
516
+ {'id': 139, 'name': 'Surveillance Camera'},
517
+ {'id': 140, 'name': 'Cat'},
518
+ {'id': 141, 'name': 'Jug'},
519
+ {'id': 142, 'name': 'Broccoli'},
520
+ {'id': 143, 'name': 'Piano'},
521
+ {'id': 144, 'name': 'Pizza'},
522
+ {'id': 145, 'name': 'Elephant'},
523
+ {'id': 146, 'name': 'Skateboard'},
524
+ {'id': 147, 'name': 'Surfboard'},
525
+ {'id': 148, 'name': 'Gun'},
526
+ {'id': 149, 'name': 'Skating and Skiing shoes'},
527
+ {'id': 150, 'name': 'Gas stove'},
528
+ {'id': 151, 'name': 'Donut'},
529
+ {'id': 152, 'name': 'Bow Tie'},
530
+ {'id': 153, 'name': 'Carrot'},
531
+ {'id': 154, 'name': 'Toilet'},
532
+ {'id': 155, 'name': 'Kite'},
533
+ {'id': 156, 'name': 'Strawberry'},
534
+ {'id': 157, 'name': 'Other Balls'},
535
+ {'id': 158, 'name': 'Shovel'},
536
+ {'id': 159, 'name': 'Pepper'},
537
+ {'id': 160, 'name': 'Computer Box'},
538
+ {'id': 161, 'name': 'Toilet Paper'},
539
+ {'id': 162, 'name': 'Cleaning Products'},
540
+ {'id': 163, 'name': 'Chopsticks'},
541
+ {'id': 164, 'name': 'Microwave'},
542
+ {'id': 165, 'name': 'Pigeon'},
543
+ {'id': 166, 'name': 'Baseball'},
544
+ {'id': 167, 'name': 'Cutting/chopping Board'},
545
+ {'id': 168, 'name': 'Coffee Table'},
546
+ {'id': 169, 'name': 'Side Table'},
547
+ {'id': 170, 'name': 'Scissors'},
548
+ {'id': 171, 'name': 'Marker'},
549
+ {'id': 172, 'name': 'Pie'},
550
+ {'id': 173, 'name': 'Ladder'},
551
+ {'id': 174, 'name': 'Snowboard'},
552
+ {'id': 175, 'name': 'Cookies'},
553
+ {'id': 176, 'name': 'Radiator'},
554
+ {'id': 177, 'name': 'Fire Hydrant'},
555
+ {'id': 178, 'name': 'Basketball'},
556
+ {'id': 179, 'name': 'Zebra'},
557
+ {'id': 180, 'name': 'Grape'},
558
+ {'id': 181, 'name': 'Giraffe'},
559
+ {'id': 182, 'name': 'Potato'},
560
+ {'id': 183, 'name': 'Sausage'},
561
+ {'id': 184, 'name': 'Tricycle'},
562
+ {'id': 185, 'name': 'Violin'},
563
+ {'id': 186, 'name': 'Egg'},
564
+ {'id': 187, 'name': 'Fire Extinguisher'},
565
+ {'id': 188, 'name': 'Candy'},
566
+ {'id': 189, 'name': 'Fire Truck'},
567
+ {'id': 190, 'name': 'Billards'},
568
+ {'id': 191, 'name': 'Converter'},
569
+ {'id': 192, 'name': 'Bathtub'},
570
+ {'id': 193, 'name': 'Wheelchair'},
571
+ {'id': 194, 'name': 'Golf Club'},
572
+ {'id': 195, 'name': 'Briefcase'},
573
+ {'id': 196, 'name': 'Cucumber'},
574
+ {'id': 197, 'name': 'Cigar/Cigarette '},
575
+ {'id': 198, 'name': 'Paint Brush'},
576
+ {'id': 199, 'name': 'Pear'},
577
+ {'id': 200, 'name': 'Heavy Truck'},
578
+ {'id': 201, 'name': 'Hamburger'},
579
+ {'id': 202, 'name': 'Extractor'},
580
+ {'id': 203, 'name': 'Extension Cord'},
581
+ {'id': 204, 'name': 'Tong'},
582
+ {'id': 205, 'name': 'Tennis Racket'},
583
+ {'id': 206, 'name': 'Folder'},
584
+ {'id': 207, 'name': 'American Football'},
585
+ {'id': 208, 'name': 'earphone'},
586
+ {'id': 209, 'name': 'Mask'},
587
+ {'id': 210, 'name': 'Kettle'},
588
+ {'id': 211, 'name': 'Tennis'},
589
+ {'id': 212, 'name': 'Ship'},
590
+ {'id': 213, 'name': 'Swing'},
591
+ {'id': 214, 'name': 'Coffee Machine'},
592
+ {'id': 215, 'name': 'Slide'},
593
+ {'id': 216, 'name': 'Carriage'},
594
+ {'id': 217, 'name': 'Onion'},
595
+ {'id': 218, 'name': 'Green beans'},
596
+ {'id': 219, 'name': 'Projector'},
597
+ {'id': 220, 'name': 'Frisbee'},
598
+ {'id': 221, 'name': 'Washing Machine/Drying Machine'},
599
+ {'id': 222, 'name': 'Chicken'},
600
+ {'id': 223, 'name': 'Printer'},
601
+ {'id': 224, 'name': 'Watermelon'},
602
+ {'id': 225, 'name': 'Saxophone'},
603
+ {'id': 226, 'name': 'Tissue'},
604
+ {'id': 227, 'name': 'Toothbrush'},
605
+ {'id': 228, 'name': 'Ice cream'},
606
+ {'id': 229, 'name': 'Hot air balloon'},
607
+ {'id': 230, 'name': 'Cello'},
608
+ {'id': 231, 'name': 'French Fries'},
609
+ {'id': 232, 'name': 'Scale'},
610
+ {'id': 233, 'name': 'Trophy'},
611
+ {'id': 234, 'name': 'Cabbage'},
612
+ {'id': 235, 'name': 'Hot dog'},
613
+ {'id': 236, 'name': 'Blender'},
614
+ {'id': 237, 'name': 'Peach'},
615
+ {'id': 238, 'name': 'Rice'},
616
+ {'id': 239, 'name': 'Wallet/Purse'},
617
+ {'id': 240, 'name': 'Volleyball'},
618
+ {'id': 241, 'name': 'Deer'},
619
+ {'id': 242, 'name': 'Goose'},
620
+ {'id': 243, 'name': 'Tape'},
621
+ {'id': 244, 'name': 'Tablet'},
622
+ {'id': 245, 'name': 'Cosmetics'},
623
+ {'id': 246, 'name': 'Trumpet'},
624
+ {'id': 247, 'name': 'Pineapple'},
625
+ {'id': 248, 'name': 'Golf Ball'},
626
+ {'id': 249, 'name': 'Ambulance'},
627
+ {'id': 250, 'name': 'Parking meter'},
628
+ {'id': 251, 'name': 'Mango'},
629
+ {'id': 252, 'name': 'Key'},
630
+ {'id': 253, 'name': 'Hurdle'},
631
+ {'id': 254, 'name': 'Fishing Rod'},
632
+ {'id': 255, 'name': 'Medal'},
633
+ {'id': 256, 'name': 'Flute'},
634
+ {'id': 257, 'name': 'Brush'},
635
+ {'id': 258, 'name': 'Penguin'},
636
+ {'id': 259, 'name': 'Megaphone'},
637
+ {'id': 260, 'name': 'Corn'},
638
+ {'id': 261, 'name': 'Lettuce'},
639
+ {'id': 262, 'name': 'Garlic'},
640
+ {'id': 263, 'name': 'Swan'},
641
+ {'id': 264, 'name': 'Helicopter'},
642
+ {'id': 265, 'name': 'Green Onion'},
643
+ {'id': 266, 'name': 'Sandwich'},
644
+ {'id': 267, 'name': 'Nuts'},
645
+ {'id': 268, 'name': 'Speed Limit Sign'},
646
+ {'id': 269, 'name': 'Induction Cooker'},
647
+ {'id': 270, 'name': 'Broom'},
648
+ {'id': 271, 'name': 'Trombone'},
649
+ {'id': 272, 'name': 'Plum'},
650
+ {'id': 273, 'name': 'Rickshaw'},
651
+ {'id': 274, 'name': 'Goldfish'},
652
+ {'id': 275, 'name': 'Kiwi fruit'},
653
+ {'id': 276, 'name': 'Router/modem'},
654
+ {'id': 277, 'name': 'Poker Card'},
655
+ {'id': 278, 'name': 'Toaster'},
656
+ {'id': 279, 'name': 'Shrimp'},
657
+ {'id': 280, 'name': 'Sushi'},
658
+ {'id': 281, 'name': 'Cheese'},
659
+ {'id': 282, 'name': 'Notepaper'},
660
+ {'id': 283, 'name': 'Cherry'},
661
+ {'id': 284, 'name': 'Pliers'},
662
+ {'id': 285, 'name': 'CD'},
663
+ {'id': 286, 'name': 'Pasta'},
664
+ {'id': 287, 'name': 'Hammer'},
665
+ {'id': 288, 'name': 'Cue'},
666
+ {'id': 289, 'name': 'Avocado'},
667
+ {'id': 290, 'name': 'Hami melon'},
668
+ {'id': 291, 'name': 'Flask'},
669
+ {'id': 292, 'name': 'Mushroom'},
670
+ {'id': 293, 'name': 'Screwdriver'},
671
+ {'id': 294, 'name': 'Soap'},
672
+ {'id': 295, 'name': 'Recorder'},
673
+ {'id': 296, 'name': 'Bear'},
674
+ {'id': 297, 'name': 'Eggplant'},
675
+ {'id': 298, 'name': 'Board Eraser'},
676
+ {'id': 299, 'name': 'Coconut'},
677
+ {'id': 300, 'name': 'Tape Measure/ Ruler'},
678
+ {'id': 301, 'name': 'Pig'},
679
+ {'id': 302, 'name': 'Showerhead'},
680
+ {'id': 303, 'name': 'Globe'},
681
+ {'id': 304, 'name': 'Chips'},
682
+ {'id': 305, 'name': 'Steak'},
683
+ {'id': 306, 'name': 'Crosswalk Sign'},
684
+ {'id': 307, 'name': 'Stapler'},
685
+ {'id': 308, 'name': 'Camel'},
686
+ {'id': 309, 'name': 'Formula 1 '},
687
+ {'id': 310, 'name': 'Pomegranate'},
688
+ {'id': 311, 'name': 'Dishwasher'},
689
+ {'id': 312, 'name': 'Crab'},
690
+ {'id': 313, 'name': 'Hoverboard'},
691
+ {'id': 314, 'name': 'Meatball'},
692
+ {'id': 315, 'name': 'Rice Cooker'},
693
+ {'id': 316, 'name': 'Tuba'},
694
+ {'id': 317, 'name': 'Calculator'},
695
+ {'id': 318, 'name': 'Papaya'},
696
+ {'id': 319, 'name': 'Antelope'},
697
+ {'id': 320, 'name': 'Parrot'},
698
+ {'id': 321, 'name': 'Seal'},
699
+ {'id': 322, 'name': 'Butterfly'},
700
+ {'id': 323, 'name': 'Dumbbell'},
701
+ {'id': 324, 'name': 'Donkey'},
702
+ {'id': 325, 'name': 'Lion'},
703
+ {'id': 326, 'name': 'Urinal'},
704
+ {'id': 327, 'name': 'Dolphin'},
705
+ {'id': 328, 'name': 'Electric Drill'},
706
+ {'id': 329, 'name': 'Hair Dryer'},
707
+ {'id': 330, 'name': 'Egg tart'},
708
+ {'id': 331, 'name': 'Jellyfish'},
709
+ {'id': 332, 'name': 'Treadmill'},
710
+ {'id': 333, 'name': 'Lighter'},
711
+ {'id': 334, 'name': 'Grapefruit'},
712
+ {'id': 335, 'name': 'Game board'},
713
+ {'id': 336, 'name': 'Mop'},
714
+ {'id': 337, 'name': 'Radish'},
715
+ {'id': 338, 'name': 'Baozi'},
716
+ {'id': 339, 'name': 'Target'},
717
+ {'id': 340, 'name': 'French'},
718
+ {'id': 341, 'name': 'Spring Rolls'},
719
+ {'id': 342, 'name': 'Monkey'},
720
+ {'id': 343, 'name': 'Rabbit'},
721
+ {'id': 344, 'name': 'Pencil Case'},
722
+ {'id': 345, 'name': 'Yak'},
723
+ {'id': 346, 'name': 'Red Cabbage'},
724
+ {'id': 347, 'name': 'Binoculars'},
725
+ {'id': 348, 'name': 'Asparagus'},
726
+ {'id': 349, 'name': 'Barbell'},
727
+ {'id': 350, 'name': 'Scallop'},
728
+ {'id': 351, 'name': 'Noddles'},
729
+ {'id': 352, 'name': 'Comb'},
730
+ {'id': 353, 'name': 'Dumpling'},
731
+ {'id': 354, 'name': 'Oyster'},
732
+ {'id': 355, 'name': 'Table Tennis paddle'},
733
+ {'id': 356, 'name': 'Cosmetics Brush/Eyeliner Pencil'},
734
+ {'id': 357, 'name': 'Chainsaw'},
735
+ {'id': 358, 'name': 'Eraser'},
736
+ {'id': 359, 'name': 'Lobster'},
737
+ {'id': 360, 'name': 'Durian'},
738
+ {'id': 361, 'name': 'Okra'},
739
+ {'id': 362, 'name': 'Lipstick'},
740
+ {'id': 363, 'name': 'Cosmetics Mirror'},
741
+ {'id': 364, 'name': 'Curling'},
742
+ {'id': 365, 'name': 'Table Tennis '},
743
+ ]
744
+
745
+
746
+ def _get_builtin_metadata():
747
+ id_to_name = {x['id']: x['name'] for x in categories_v2_fix}
748
+ thing_dataset_id_to_contiguous_id = {
749
+ x['id']: i for i, x in enumerate(
750
+ sorted(categories_v2_fix, key=lambda x: x['id']))}
751
+ thing_classes = [id_to_name[k] for k in sorted(id_to_name)]
752
+ return {
753
+ "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id,
754
+ "thing_classes": thing_classes}
755
+
756
+
757
+ _PREDEFINED_SPLITS_OBJECTS365 = {
758
+ "objects365_v2_train": ("objects365/train", "objects365/annotations/zhiyuan_objv2_train_fixname_fixmiss.json"),
759
+ # 80,000 images, 1,240,587 annotations
760
+ "objects365_v2_val": ("objects365/val", "objects365/annotations/zhiyuan_objv2_val_fixname.json"),
761
+ "objects365_v2_val_rare": ("objects365/val", "objects365/annotations/zhiyuan_objv2_val_fixname_rare.json"),
762
+ }
763
+
764
+ for key, (image_root, json_file) in _PREDEFINED_SPLITS_OBJECTS365.items():
765
+ register_coco_instances(
766
+ key,
767
+ _get_builtin_metadata(),
768
+ os.path.join("datasets", json_file) if "://" not in json_file else json_file,
769
+ os.path.join("datasets", image_root),
770
+ )
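Once this module is imported, the three Objects365 v2 splits above are exposed through Detectron2's catalogs. A minimal usage sketch (assuming the `proxydet` package is importable and, for actually loading dicts, that the json/image paths listed above exist under `datasets/`):

    # Sketch: inspect the Objects365 registration performed above.
    from detectron2.data import DatasetCatalog, MetadataCatalog
    import proxydet.data.datasets.objects365  # noqa: F401  -- importing runs the registration loop

    meta = MetadataCatalog.get("objects365_v2_val")
    print(len(meta.thing_classes))                     # 365 class names
    print(meta.thing_dataset_id_to_contiguous_id[67])  # 66, assuming ids run 1..365

    # Loading the actual dicts needs the annotation file on disk:
    # dicts = DatasetCatalog.get("objects365_v2_val")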
proxydet/data/datasets/oid.py ADDED
@@ -0,0 +1,535 @@
1
+ # Part of the code is from https://github.com/xingyizhou/UniDet/blob/master/projects/UniDet/unidet/data/datasets/oid.py
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ from .register_oid import register_oid_instances
4
+ import os
5
+
6
+ categories = [
7
+ {'id': 1, 'name': 'Infant bed', 'freebase_id': '/m/061hd_'},
8
+ {'id': 2, 'name': 'Rose', 'freebase_id': '/m/06m11'},
9
+ {'id': 3, 'name': 'Flag', 'freebase_id': '/m/03120'},
10
+ {'id': 4, 'name': 'Flashlight', 'freebase_id': '/m/01kb5b'},
11
+ {'id': 5, 'name': 'Sea turtle', 'freebase_id': '/m/0120dh'},
12
+ {'id': 6, 'name': 'Camera', 'freebase_id': '/m/0dv5r'},
13
+ {'id': 7, 'name': 'Animal', 'freebase_id': '/m/0jbk'},
14
+ {'id': 8, 'name': 'Glove', 'freebase_id': '/m/0174n1'},
15
+ {'id': 9, 'name': 'Crocodile', 'freebase_id': '/m/09f_2'},
16
+ {'id': 10, 'name': 'Cattle', 'freebase_id': '/m/01xq0k1'},
17
+ {'id': 11, 'name': 'House', 'freebase_id': '/m/03jm5'},
18
+ {'id': 12, 'name': 'Guacamole', 'freebase_id': '/m/02g30s'},
19
+ {'id': 13, 'name': 'Penguin', 'freebase_id': '/m/05z6w'},
20
+ {'id': 14, 'name': 'Vehicle registration plate', 'freebase_id': '/m/01jfm_'},
21
+ {'id': 15, 'name': 'Bench', 'freebase_id': '/m/076lb9'},
22
+ {'id': 16, 'name': 'Ladybug', 'freebase_id': '/m/0gj37'},
23
+ {'id': 17, 'name': 'Human nose', 'freebase_id': '/m/0k0pj'},
24
+ {'id': 18, 'name': 'Watermelon', 'freebase_id': '/m/0kpqd'},
25
+ {'id': 19, 'name': 'Flute', 'freebase_id': '/m/0l14j_'},
26
+ {'id': 20, 'name': 'Butterfly', 'freebase_id': '/m/0cyf8'},
27
+ {'id': 21, 'name': 'Washing machine', 'freebase_id': '/m/0174k2'},
28
+ {'id': 22, 'name': 'Raccoon', 'freebase_id': '/m/0dq75'},
29
+ {'id': 23, 'name': 'Segway', 'freebase_id': '/m/076bq'},
30
+ {'id': 24, 'name': 'Taco', 'freebase_id': '/m/07crc'},
31
+ {'id': 25, 'name': 'Jellyfish', 'freebase_id': '/m/0d8zb'},
32
+ {'id': 26, 'name': 'Cake', 'freebase_id': '/m/0fszt'},
33
+ {'id': 27, 'name': 'Pen', 'freebase_id': '/m/0k1tl'},
34
+ {'id': 28, 'name': 'Cannon', 'freebase_id': '/m/020kz'},
35
+ {'id': 29, 'name': 'Bread', 'freebase_id': '/m/09728'},
36
+ {'id': 30, 'name': 'Tree', 'freebase_id': '/m/07j7r'},
37
+ {'id': 31, 'name': 'Shellfish', 'freebase_id': '/m/0fbdv'},
38
+ {'id': 32, 'name': 'Bed', 'freebase_id': '/m/03ssj5'},
39
+ {'id': 33, 'name': 'Hamster', 'freebase_id': '/m/03qrc'},
40
+ {'id': 34, 'name': 'Hat', 'freebase_id': '/m/02dl1y'},
41
+ {'id': 35, 'name': 'Toaster', 'freebase_id': '/m/01k6s3'},
42
+ {'id': 36, 'name': 'Sombrero', 'freebase_id': '/m/02jfl0'},
43
+ {'id': 37, 'name': 'Tiara', 'freebase_id': '/m/01krhy'},
44
+ {'id': 38, 'name': 'Bowl', 'freebase_id': '/m/04kkgm'},
45
+ {'id': 39, 'name': 'Dragonfly', 'freebase_id': '/m/0ft9s'},
46
+ {'id': 40, 'name': 'Moths and butterflies', 'freebase_id': '/m/0d_2m'},
47
+ {'id': 41, 'name': 'Antelope', 'freebase_id': '/m/0czz2'},
48
+ {'id': 42, 'name': 'Vegetable', 'freebase_id': '/m/0f4s2w'},
49
+ {'id': 43, 'name': 'Torch', 'freebase_id': '/m/07dd4'},
50
+ {'id': 44, 'name': 'Building', 'freebase_id': '/m/0cgh4'},
51
+ {'id': 45, 'name': 'Power plugs and sockets', 'freebase_id': '/m/03bbps'},
52
+ {'id': 46, 'name': 'Blender', 'freebase_id': '/m/02pjr4'},
53
+ {'id': 47, 'name': 'Billiard table', 'freebase_id': '/m/04p0qw'},
54
+ {'id': 48, 'name': 'Cutting board', 'freebase_id': '/m/02pdsw'},
55
+ {'id': 49, 'name': 'Bronze sculpture', 'freebase_id': '/m/01yx86'},
56
+ {'id': 50, 'name': 'Turtle', 'freebase_id': '/m/09dzg'},
57
+ {'id': 51, 'name': 'Broccoli', 'freebase_id': '/m/0hkxq'},
58
+ {'id': 52, 'name': 'Tiger', 'freebase_id': '/m/07dm6'},
59
+ {'id': 53, 'name': 'Mirror', 'freebase_id': '/m/054_l'},
60
+ {'id': 54, 'name': 'Bear', 'freebase_id': '/m/01dws'},
61
+ {'id': 55, 'name': 'Zucchini', 'freebase_id': '/m/027pcv'},
62
+ {'id': 56, 'name': 'Dress', 'freebase_id': '/m/01d40f'},
63
+ {'id': 57, 'name': 'Volleyball', 'freebase_id': '/m/02rgn06'},
64
+ {'id': 58, 'name': 'Guitar', 'freebase_id': '/m/0342h'},
65
+ {'id': 59, 'name': 'Reptile', 'freebase_id': '/m/06bt6'},
66
+ {'id': 60, 'name': 'Golf cart', 'freebase_id': '/m/0323sq'},
67
+ {'id': 61, 'name': 'Tart', 'freebase_id': '/m/02zvsm'},
68
+ {'id': 62, 'name': 'Fedora', 'freebase_id': '/m/02fq_6'},
69
+ {'id': 63, 'name': 'Carnivore', 'freebase_id': '/m/01lrl'},
70
+ {'id': 64, 'name': 'Car', 'freebase_id': '/m/0k4j'},
71
+ {'id': 65, 'name': 'Lighthouse', 'freebase_id': '/m/04h7h'},
72
+ {'id': 66, 'name': 'Coffeemaker', 'freebase_id': '/m/07xyvk'},
73
+ {'id': 67, 'name': 'Food processor', 'freebase_id': '/m/03y6mg'},
74
+ {'id': 68, 'name': 'Truck', 'freebase_id': '/m/07r04'},
75
+ {'id': 69, 'name': 'Bookcase', 'freebase_id': '/m/03__z0'},
76
+ {'id': 70, 'name': 'Surfboard', 'freebase_id': '/m/019w40'},
77
+ {'id': 71, 'name': 'Footwear', 'freebase_id': '/m/09j5n'},
78
+ {'id': 72, 'name': 'Bench', 'freebase_id': '/m/0cvnqh'},
79
+ {'id': 73, 'name': 'Necklace', 'freebase_id': '/m/01llwg'},
80
+ {'id': 74, 'name': 'Flower', 'freebase_id': '/m/0c9ph5'},
81
+ {'id': 75, 'name': 'Radish', 'freebase_id': '/m/015x5n'},
82
+ {'id': 76, 'name': 'Marine mammal', 'freebase_id': '/m/0gd2v'},
83
+ {'id': 77, 'name': 'Frying pan', 'freebase_id': '/m/04v6l4'},
84
+ {'id': 78, 'name': 'Tap', 'freebase_id': '/m/02jz0l'},
85
+ {'id': 79, 'name': 'Peach', 'freebase_id': '/m/0dj6p'},
86
+ {'id': 80, 'name': 'Knife', 'freebase_id': '/m/04ctx'},
87
+ {'id': 81, 'name': 'Handbag', 'freebase_id': '/m/080hkjn'},
88
+ {'id': 82, 'name': 'Laptop', 'freebase_id': '/m/01c648'},
89
+ {'id': 83, 'name': 'Tent', 'freebase_id': '/m/01j61q'},
90
+ {'id': 84, 'name': 'Ambulance', 'freebase_id': '/m/012n7d'},
91
+ {'id': 85, 'name': 'Christmas tree', 'freebase_id': '/m/025nd'},
92
+ {'id': 86, 'name': 'Eagle', 'freebase_id': '/m/09csl'},
93
+ {'id': 87, 'name': 'Limousine', 'freebase_id': '/m/01lcw4'},
94
+ {'id': 88, 'name': 'Kitchen & dining room table', 'freebase_id': '/m/0h8n5zk'},
95
+ {'id': 89, 'name': 'Polar bear', 'freebase_id': '/m/0633h'},
96
+ {'id': 90, 'name': 'Tower', 'freebase_id': '/m/01fdzj'},
97
+ {'id': 91, 'name': 'Football', 'freebase_id': '/m/01226z'},
98
+ {'id': 92, 'name': 'Willow', 'freebase_id': '/m/0mw_6'},
99
+ {'id': 93, 'name': 'Human head', 'freebase_id': '/m/04hgtk'},
100
+ {'id': 94, 'name': 'Stop sign', 'freebase_id': '/m/02pv19'},
101
+ {'id': 95, 'name': 'Banana', 'freebase_id': '/m/09qck'},
102
+ {'id': 96, 'name': 'Mixer', 'freebase_id': '/m/063rgb'},
103
+ {'id': 97, 'name': 'Binoculars', 'freebase_id': '/m/0lt4_'},
104
+ {'id': 98, 'name': 'Dessert', 'freebase_id': '/m/0270h'},
105
+ {'id': 99, 'name': 'Bee', 'freebase_id': '/m/01h3n'},
106
+ {'id': 100, 'name': 'Chair', 'freebase_id': '/m/01mzpv'},
107
+ {'id': 101, 'name': 'Wood-burning stove', 'freebase_id': '/m/04169hn'},
108
+ {'id': 102, 'name': 'Flowerpot', 'freebase_id': '/m/0fm3zh'},
109
+ {'id': 103, 'name': 'Beaker', 'freebase_id': '/m/0d20w4'},
110
+ {'id': 104, 'name': 'Oyster', 'freebase_id': '/m/0_cp5'},
111
+ {'id': 105, 'name': 'Woodpecker', 'freebase_id': '/m/01dy8n'},
112
+ {'id': 106, 'name': 'Harp', 'freebase_id': '/m/03m5k'},
113
+ {'id': 107, 'name': 'Bathtub', 'freebase_id': '/m/03dnzn'},
114
+ {'id': 108, 'name': 'Wall clock', 'freebase_id': '/m/0h8mzrc'},
115
+ {'id': 109, 'name': 'Sports uniform', 'freebase_id': '/m/0h8mhzd'},
116
+ {'id': 110, 'name': 'Rhinoceros', 'freebase_id': '/m/03d443'},
117
+ {'id': 111, 'name': 'Beehive', 'freebase_id': '/m/01gllr'},
118
+ {'id': 112, 'name': 'Cupboard', 'freebase_id': '/m/0642b4'},
119
+ {'id': 113, 'name': 'Chicken', 'freebase_id': '/m/09b5t'},
120
+ {'id': 114, 'name': 'Man', 'freebase_id': '/m/04yx4'},
121
+ {'id': 115, 'name': 'Blue jay', 'freebase_id': '/m/01f8m5'},
122
+ {'id': 116, 'name': 'Cucumber', 'freebase_id': '/m/015x4r'},
123
+ {'id': 117, 'name': 'Balloon', 'freebase_id': '/m/01j51'},
124
+ {'id': 118, 'name': 'Kite', 'freebase_id': '/m/02zt3'},
125
+ {'id': 119, 'name': 'Fireplace', 'freebase_id': '/m/03tw93'},
126
+ {'id': 120, 'name': 'Lantern', 'freebase_id': '/m/01jfsr'},
127
+ {'id': 121, 'name': 'Missile', 'freebase_id': '/m/04ylt'},
128
+ {'id': 122, 'name': 'Book', 'freebase_id': '/m/0bt_c3'},
129
+ {'id': 123, 'name': 'Spoon', 'freebase_id': '/m/0cmx8'},
130
+ {'id': 124, 'name': 'Grapefruit', 'freebase_id': '/m/0hqkz'},
131
+ {'id': 125, 'name': 'Squirrel', 'freebase_id': '/m/071qp'},
132
+ {'id': 126, 'name': 'Orange', 'freebase_id': '/m/0cyhj_'},
133
+ {'id': 127, 'name': 'Coat', 'freebase_id': '/m/01xygc'},
134
+ {'id': 128, 'name': 'Punching bag', 'freebase_id': '/m/0420v5'},
135
+ {'id': 129, 'name': 'Zebra', 'freebase_id': '/m/0898b'},
136
+ {'id': 130, 'name': 'Billboard', 'freebase_id': '/m/01knjb'},
137
+ {'id': 131, 'name': 'Bicycle', 'freebase_id': '/m/0199g'},
138
+ {'id': 132, 'name': 'Door handle', 'freebase_id': '/m/03c7gz'},
139
+ {'id': 133, 'name': 'Mechanical fan', 'freebase_id': '/m/02x984l'},
140
+ {'id': 134, 'name': 'Ring binder', 'freebase_id': '/m/04zwwv'},
141
+ {'id': 135, 'name': 'Table', 'freebase_id': '/m/04bcr3'},
142
+ {'id': 136, 'name': 'Parrot', 'freebase_id': '/m/0gv1x'},
143
+ {'id': 137, 'name': 'Sock', 'freebase_id': '/m/01nq26'},
144
+ {'id': 138, 'name': 'Vase', 'freebase_id': '/m/02s195'},
145
+ {'id': 139, 'name': 'Weapon', 'freebase_id': '/m/083kb'},
146
+ {'id': 140, 'name': 'Shotgun', 'freebase_id': '/m/06nrc'},
147
+ {'id': 141, 'name': 'Glasses', 'freebase_id': '/m/0jyfg'},
148
+ {'id': 142, 'name': 'Seahorse', 'freebase_id': '/m/0nybt'},
149
+ {'id': 143, 'name': 'Belt', 'freebase_id': '/m/0176mf'},
150
+ {'id': 144, 'name': 'Watercraft', 'freebase_id': '/m/01rzcn'},
151
+ {'id': 145, 'name': 'Window', 'freebase_id': '/m/0d4v4'},
152
+ {'id': 146, 'name': 'Giraffe', 'freebase_id': '/m/03bk1'},
153
+ {'id': 147, 'name': 'Lion', 'freebase_id': '/m/096mb'},
154
+ {'id': 148, 'name': 'Tire', 'freebase_id': '/m/0h9mv'},
155
+ {'id': 149, 'name': 'Vehicle', 'freebase_id': '/m/07yv9'},
156
+ {'id': 150, 'name': 'Canoe', 'freebase_id': '/m/0ph39'},
157
+ {'id': 151, 'name': 'Tie', 'freebase_id': '/m/01rkbr'},
158
+ {'id': 152, 'name': 'Shelf', 'freebase_id': '/m/0gjbg72'},
159
+ {'id': 153, 'name': 'Picture frame', 'freebase_id': '/m/06z37_'},
160
+ {'id': 154, 'name': 'Printer', 'freebase_id': '/m/01m4t'},
161
+ {'id': 155, 'name': 'Human leg', 'freebase_id': '/m/035r7c'},
162
+ {'id': 156, 'name': 'Boat', 'freebase_id': '/m/019jd'},
163
+ {'id': 157, 'name': 'Slow cooker', 'freebase_id': '/m/02tsc9'},
164
+ {'id': 158, 'name': 'Croissant', 'freebase_id': '/m/015wgc'},
165
+ {'id': 159, 'name': 'Candle', 'freebase_id': '/m/0c06p'},
166
+ {'id': 160, 'name': 'Pancake', 'freebase_id': '/m/01dwwc'},
167
+ {'id': 161, 'name': 'Pillow', 'freebase_id': '/m/034c16'},
168
+ {'id': 162, 'name': 'Coin', 'freebase_id': '/m/0242l'},
169
+ {'id': 163, 'name': 'Stretcher', 'freebase_id': '/m/02lbcq'},
170
+ {'id': 164, 'name': 'Sandal', 'freebase_id': '/m/03nfch'},
171
+ {'id': 165, 'name': 'Woman', 'freebase_id': '/m/03bt1vf'},
172
+ {'id': 166, 'name': 'Stairs', 'freebase_id': '/m/01lynh'},
173
+ {'id': 167, 'name': 'Harpsichord', 'freebase_id': '/m/03q5t'},
174
+ {'id': 168, 'name': 'Stool', 'freebase_id': '/m/0fqt361'},
175
+ {'id': 169, 'name': 'Bus', 'freebase_id': '/m/01bjv'},
176
+ {'id': 170, 'name': 'Suitcase', 'freebase_id': '/m/01s55n'},
177
+ {'id': 171, 'name': 'Human mouth', 'freebase_id': '/m/0283dt1'},
178
+ {'id': 172, 'name': 'Juice', 'freebase_id': '/m/01z1kdw'},
179
+ {'id': 173, 'name': 'Skull', 'freebase_id': '/m/016m2d'},
180
+ {'id': 174, 'name': 'Door', 'freebase_id': '/m/02dgv'},
181
+ {'id': 175, 'name': 'Violin', 'freebase_id': '/m/07y_7'},
182
+ {'id': 176, 'name': 'Chopsticks', 'freebase_id': '/m/01_5g'},
183
+ {'id': 177, 'name': 'Digital clock', 'freebase_id': '/m/06_72j'},
184
+ {'id': 178, 'name': 'Sunflower', 'freebase_id': '/m/0ftb8'},
185
+ {'id': 179, 'name': 'Leopard', 'freebase_id': '/m/0c29q'},
186
+ {'id': 180, 'name': 'Bell pepper', 'freebase_id': '/m/0jg57'},
187
+ {'id': 181, 'name': 'Harbor seal', 'freebase_id': '/m/02l8p9'},
188
+ {'id': 182, 'name': 'Snake', 'freebase_id': '/m/078jl'},
189
+ {'id': 183, 'name': 'Sewing machine', 'freebase_id': '/m/0llzx'},
190
+ {'id': 184, 'name': 'Goose', 'freebase_id': '/m/0dbvp'},
191
+ {'id': 185, 'name': 'Helicopter', 'freebase_id': '/m/09ct_'},
192
+ {'id': 186, 'name': 'Seat belt', 'freebase_id': '/m/0dkzw'},
193
+ {'id': 187, 'name': 'Coffee cup', 'freebase_id': '/m/02p5f1q'},
194
+ {'id': 188, 'name': 'Microwave oven', 'freebase_id': '/m/0fx9l'},
195
+ {'id': 189, 'name': 'Hot dog', 'freebase_id': '/m/01b9xk'},
196
+ {'id': 190, 'name': 'Countertop', 'freebase_id': '/m/0b3fp9'},
197
+ {'id': 191, 'name': 'Serving tray', 'freebase_id': '/m/0h8n27j'},
198
+ {'id': 192, 'name': 'Dog bed', 'freebase_id': '/m/0h8n6f9'},
199
+ {'id': 193, 'name': 'Beer', 'freebase_id': '/m/01599'},
200
+ {'id': 194, 'name': 'Sunglasses', 'freebase_id': '/m/017ftj'},
201
+ {'id': 195, 'name': 'Golf ball', 'freebase_id': '/m/044r5d'},
202
+ {'id': 196, 'name': 'Waffle', 'freebase_id': '/m/01dwsz'},
203
+ {'id': 197, 'name': 'Palm tree', 'freebase_id': '/m/0cdl1'},
204
+ {'id': 198, 'name': 'Trumpet', 'freebase_id': '/m/07gql'},
205
+ {'id': 199, 'name': 'Ruler', 'freebase_id': '/m/0hdln'},
206
+ {'id': 200, 'name': 'Helmet', 'freebase_id': '/m/0zvk5'},
207
+ {'id': 201, 'name': 'Ladder', 'freebase_id': '/m/012w5l'},
208
+ {'id': 202, 'name': 'Office building', 'freebase_id': '/m/021sj1'},
209
+ {'id': 203, 'name': 'Tablet computer', 'freebase_id': '/m/0bh9flk'},
210
+ {'id': 204, 'name': 'Toilet paper', 'freebase_id': '/m/09gtd'},
211
+ {'id': 205, 'name': 'Pomegranate', 'freebase_id': '/m/0jwn_'},
212
+ {'id': 206, 'name': 'Skirt', 'freebase_id': '/m/02wv6h6'},
213
+ {'id': 207, 'name': 'Gas stove', 'freebase_id': '/m/02wv84t'},
214
+ {'id': 208, 'name': 'Cookie', 'freebase_id': '/m/021mn'},
215
+ {'id': 209, 'name': 'Cart', 'freebase_id': '/m/018p4k'},
216
+ {'id': 210, 'name': 'Raven', 'freebase_id': '/m/06j2d'},
217
+ {'id': 211, 'name': 'Egg', 'freebase_id': '/m/033cnk'},
218
+ {'id': 212, 'name': 'Burrito', 'freebase_id': '/m/01j3zr'},
219
+ {'id': 213, 'name': 'Goat', 'freebase_id': '/m/03fwl'},
220
+ {'id': 214, 'name': 'Kitchen knife', 'freebase_id': '/m/058qzx'},
221
+ {'id': 215, 'name': 'Skateboard', 'freebase_id': '/m/06_fw'},
222
+ {'id': 216, 'name': 'Salt and pepper shakers', 'freebase_id': '/m/02x8cch'},
223
+ {'id': 217, 'name': 'Lynx', 'freebase_id': '/m/04g2r'},
224
+ {'id': 218, 'name': 'Boot', 'freebase_id': '/m/01b638'},
225
+ {'id': 219, 'name': 'Platter', 'freebase_id': '/m/099ssp'},
226
+ {'id': 220, 'name': 'Ski', 'freebase_id': '/m/071p9'},
227
+ {'id': 221, 'name': 'Swimwear', 'freebase_id': '/m/01gkx_'},
228
+ {'id': 222, 'name': 'Swimming pool', 'freebase_id': '/m/0b_rs'},
229
+ {'id': 223, 'name': 'Drinking straw', 'freebase_id': '/m/03v5tg'},
230
+ {'id': 224, 'name': 'Wrench', 'freebase_id': '/m/01j5ks'},
231
+ {'id': 225, 'name': 'Drum', 'freebase_id': '/m/026t6'},
232
+ {'id': 226, 'name': 'Ant', 'freebase_id': '/m/0_k2'},
233
+ {'id': 227, 'name': 'Human ear', 'freebase_id': '/m/039xj_'},
234
+ {'id': 228, 'name': 'Headphones', 'freebase_id': '/m/01b7fy'},
235
+ {'id': 229, 'name': 'Fountain', 'freebase_id': '/m/0220r2'},
236
+ {'id': 230, 'name': 'Bird', 'freebase_id': '/m/015p6'},
237
+ {'id': 231, 'name': 'Jeans', 'freebase_id': '/m/0fly7'},
238
+ {'id': 232, 'name': 'Television', 'freebase_id': '/m/07c52'},
239
+ {'id': 233, 'name': 'Crab', 'freebase_id': '/m/0n28_'},
240
+ {'id': 234, 'name': 'Microphone', 'freebase_id': '/m/0hg7b'},
241
+ {'id': 235, 'name': 'Home appliance', 'freebase_id': '/m/019dx1'},
242
+ {'id': 236, 'name': 'Snowplow', 'freebase_id': '/m/04vv5k'},
243
+ {'id': 237, 'name': 'Beetle', 'freebase_id': '/m/020jm'},
244
+ {'id': 238, 'name': 'Artichoke', 'freebase_id': '/m/047v4b'},
245
+ {'id': 239, 'name': 'Jet ski', 'freebase_id': '/m/01xs3r'},
246
+ {'id': 240, 'name': 'Stationary bicycle', 'freebase_id': '/m/03kt2w'},
247
+ {'id': 241, 'name': 'Human hair', 'freebase_id': '/m/03q69'},
248
+ {'id': 242, 'name': 'Brown bear', 'freebase_id': '/m/01dxs'},
249
+ {'id': 243, 'name': 'Starfish', 'freebase_id': '/m/01h8tj'},
250
+ {'id': 244, 'name': 'Fork', 'freebase_id': '/m/0dt3t'},
251
+ {'id': 245, 'name': 'Lobster', 'freebase_id': '/m/0cjq5'},
252
+ {'id': 246, 'name': 'Corded phone', 'freebase_id': '/m/0h8lkj8'},
253
+ {'id': 247, 'name': 'Drink', 'freebase_id': '/m/0271t'},
254
+ {'id': 248, 'name': 'Saucer', 'freebase_id': '/m/03q5c7'},
255
+ {'id': 249, 'name': 'Carrot', 'freebase_id': '/m/0fj52s'},
256
+ {'id': 250, 'name': 'Insect', 'freebase_id': '/m/03vt0'},
257
+ {'id': 251, 'name': 'Clock', 'freebase_id': '/m/01x3z'},
258
+ {'id': 252, 'name': 'Castle', 'freebase_id': '/m/0d5gx'},
259
+ {'id': 253, 'name': 'Tennis racket', 'freebase_id': '/m/0h8my_4'},
260
+ {'id': 254, 'name': 'Ceiling fan', 'freebase_id': '/m/03ldnb'},
261
+ {'id': 255, 'name': 'Asparagus', 'freebase_id': '/m/0cjs7'},
262
+ {'id': 256, 'name': 'Jaguar', 'freebase_id': '/m/0449p'},
263
+ {'id': 257, 'name': 'Musical instrument', 'freebase_id': '/m/04szw'},
264
+ {'id': 258, 'name': 'Train', 'freebase_id': '/m/07jdr'},
265
+ {'id': 259, 'name': 'Cat', 'freebase_id': '/m/01yrx'},
266
+ {'id': 260, 'name': 'Rifle', 'freebase_id': '/m/06c54'},
267
+ {'id': 261, 'name': 'Dumbbell', 'freebase_id': '/m/04h8sr'},
268
+ {'id': 262, 'name': 'Mobile phone', 'freebase_id': '/m/050k8'},
269
+ {'id': 263, 'name': 'Taxi', 'freebase_id': '/m/0pg52'},
270
+ {'id': 264, 'name': 'Shower', 'freebase_id': '/m/02f9f_'},
271
+ {'id': 265, 'name': 'Pitcher', 'freebase_id': '/m/054fyh'},
272
+ {'id': 266, 'name': 'Lemon', 'freebase_id': '/m/09k_b'},
273
+ {'id': 267, 'name': 'Invertebrate', 'freebase_id': '/m/03xxp'},
274
+ {'id': 268, 'name': 'Turkey', 'freebase_id': '/m/0jly1'},
275
+ {'id': 269, 'name': 'High heels', 'freebase_id': '/m/06k2mb'},
276
+ {'id': 270, 'name': 'Bust', 'freebase_id': '/m/04yqq2'},
277
+ {'id': 271, 'name': 'Elephant', 'freebase_id': '/m/0bwd_0j'},
278
+ {'id': 272, 'name': 'Scarf', 'freebase_id': '/m/02h19r'},
279
+ {'id': 273, 'name': 'Barrel', 'freebase_id': '/m/02zn6n'},
280
+ {'id': 274, 'name': 'Trombone', 'freebase_id': '/m/07c6l'},
281
+ {'id': 275, 'name': 'Pumpkin', 'freebase_id': '/m/05zsy'},
282
+ {'id': 276, 'name': 'Box', 'freebase_id': '/m/025dyy'},
283
+ {'id': 277, 'name': 'Tomato', 'freebase_id': '/m/07j87'},
284
+ {'id': 278, 'name': 'Frog', 'freebase_id': '/m/09ld4'},
285
+ {'id': 279, 'name': 'Bidet', 'freebase_id': '/m/01vbnl'},
286
+ {'id': 280, 'name': 'Human face', 'freebase_id': '/m/0dzct'},
287
+ {'id': 281, 'name': 'Houseplant', 'freebase_id': '/m/03fp41'},
288
+ {'id': 282, 'name': 'Van', 'freebase_id': '/m/0h2r6'},
289
+ {'id': 283, 'name': 'Shark', 'freebase_id': '/m/0by6g'},
290
+ {'id': 284, 'name': 'Ice cream', 'freebase_id': '/m/0cxn2'},
291
+ {'id': 285, 'name': 'Swim cap', 'freebase_id': '/m/04tn4x'},
292
+ {'id': 286, 'name': 'Falcon', 'freebase_id': '/m/0f6wt'},
293
+ {'id': 287, 'name': 'Ostrich', 'freebase_id': '/m/05n4y'},
294
+ {'id': 288, 'name': 'Handgun', 'freebase_id': '/m/0gxl3'},
295
+ {'id': 289, 'name': 'Whiteboard', 'freebase_id': '/m/02d9qx'},
296
+ {'id': 290, 'name': 'Lizard', 'freebase_id': '/m/04m9y'},
297
+ {'id': 291, 'name': 'Pasta', 'freebase_id': '/m/05z55'},
298
+ {'id': 292, 'name': 'Snowmobile', 'freebase_id': '/m/01x3jk'},
299
+ {'id': 293, 'name': 'Light bulb', 'freebase_id': '/m/0h8l4fh'},
300
+ {'id': 294, 'name': 'Window blind', 'freebase_id': '/m/031b6r'},
301
+ {'id': 295, 'name': 'Muffin', 'freebase_id': '/m/01tcjp'},
302
+ {'id': 296, 'name': 'Pretzel', 'freebase_id': '/m/01f91_'},
303
+ {'id': 297, 'name': 'Computer monitor', 'freebase_id': '/m/02522'},
304
+ {'id': 298, 'name': 'Horn', 'freebase_id': '/m/0319l'},
305
+ {'id': 299, 'name': 'Furniture', 'freebase_id': '/m/0c_jw'},
306
+ {'id': 300, 'name': 'Sandwich', 'freebase_id': '/m/0l515'},
307
+ {'id': 301, 'name': 'Fox', 'freebase_id': '/m/0306r'},
308
+ {'id': 302, 'name': 'Convenience store', 'freebase_id': '/m/0crjs'},
309
+ {'id': 303, 'name': 'Fish', 'freebase_id': '/m/0ch_cf'},
310
+ {'id': 304, 'name': 'Fruit', 'freebase_id': '/m/02xwb'},
311
+ {'id': 305, 'name': 'Earrings', 'freebase_id': '/m/01r546'},
312
+ {'id': 306, 'name': 'Curtain', 'freebase_id': '/m/03rszm'},
313
+ {'id': 307, 'name': 'Grape', 'freebase_id': '/m/0388q'},
314
+ {'id': 308, 'name': 'Sofa bed', 'freebase_id': '/m/03m3pdh'},
315
+ {'id': 309, 'name': 'Horse', 'freebase_id': '/m/03k3r'},
316
+ {'id': 310, 'name': 'Luggage and bags', 'freebase_id': '/m/0hf58v5'},
317
+ {'id': 311, 'name': 'Desk', 'freebase_id': '/m/01y9k5'},
318
+ {'id': 312, 'name': 'Crutch', 'freebase_id': '/m/05441v'},
319
+ {'id': 313, 'name': 'Bicycle helmet', 'freebase_id': '/m/03p3bw'},
320
+ {'id': 314, 'name': 'Tick', 'freebase_id': '/m/0175cv'},
321
+ {'id': 315, 'name': 'Airplane', 'freebase_id': '/m/0cmf2'},
322
+ {'id': 316, 'name': 'Canary', 'freebase_id': '/m/0ccs93'},
323
+ {'id': 317, 'name': 'Spatula', 'freebase_id': '/m/02d1br'},
324
+ {'id': 318, 'name': 'Watch', 'freebase_id': '/m/0gjkl'},
325
+ {'id': 319, 'name': 'Lily', 'freebase_id': '/m/0jqgx'},
326
+ {'id': 320, 'name': 'Kitchen appliance', 'freebase_id': '/m/0h99cwc'},
327
+ {'id': 321, 'name': 'Filing cabinet', 'freebase_id': '/m/047j0r'},
328
+ {'id': 322, 'name': 'Aircraft', 'freebase_id': '/m/0k5j'},
329
+ {'id': 323, 'name': 'Cake stand', 'freebase_id': '/m/0h8n6ft'},
330
+ {'id': 324, 'name': 'Candy', 'freebase_id': '/m/0gm28'},
331
+ {'id': 325, 'name': 'Sink', 'freebase_id': '/m/0130jx'},
332
+ {'id': 326, 'name': 'Mouse', 'freebase_id': '/m/04rmv'},
333
+ {'id': 327, 'name': 'Wine', 'freebase_id': '/m/081qc'},
334
+ {'id': 328, 'name': 'Wheelchair', 'freebase_id': '/m/0qmmr'},
335
+ {'id': 329, 'name': 'Goldfish', 'freebase_id': '/m/03fj2'},
336
+ {'id': 330, 'name': 'Refrigerator', 'freebase_id': '/m/040b_t'},
337
+ {'id': 331, 'name': 'French fries', 'freebase_id': '/m/02y6n'},
338
+ {'id': 332, 'name': 'Drawer', 'freebase_id': '/m/0fqfqc'},
339
+ {'id': 333, 'name': 'Treadmill', 'freebase_id': '/m/030610'},
340
+ {'id': 334, 'name': 'Picnic basket', 'freebase_id': '/m/07kng9'},
341
+ {'id': 335, 'name': 'Dice', 'freebase_id': '/m/029b3'},
342
+ {'id': 336, 'name': 'Cabbage', 'freebase_id': '/m/0fbw6'},
343
+ {'id': 337, 'name': 'Football helmet', 'freebase_id': '/m/07qxg_'},
344
+ {'id': 338, 'name': 'Pig', 'freebase_id': '/m/068zj'},
345
+ {'id': 339, 'name': 'Person', 'freebase_id': '/m/01g317'},
346
+ {'id': 340, 'name': 'Shorts', 'freebase_id': '/m/01bfm9'},
347
+ {'id': 341, 'name': 'Gondola', 'freebase_id': '/m/02068x'},
348
+ {'id': 342, 'name': 'Honeycomb', 'freebase_id': '/m/0fz0h'},
349
+ {'id': 343, 'name': 'Doughnut', 'freebase_id': '/m/0jy4k'},
350
+ {'id': 344, 'name': 'Chest of drawers', 'freebase_id': '/m/05kyg_'},
351
+ {'id': 345, 'name': 'Land vehicle', 'freebase_id': '/m/01prls'},
352
+ {'id': 346, 'name': 'Bat', 'freebase_id': '/m/01h44'},
353
+ {'id': 347, 'name': 'Monkey', 'freebase_id': '/m/08pbxl'},
354
+ {'id': 348, 'name': 'Dagger', 'freebase_id': '/m/02gzp'},
355
+ {'id': 349, 'name': 'Tableware', 'freebase_id': '/m/04brg2'},
356
+ {'id': 350, 'name': 'Human foot', 'freebase_id': '/m/031n1'},
357
+ {'id': 351, 'name': 'Mug', 'freebase_id': '/m/02jvh9'},
358
+ {'id': 352, 'name': 'Alarm clock', 'freebase_id': '/m/046dlr'},
359
+ {'id': 353, 'name': 'Pressure cooker', 'freebase_id': '/m/0h8ntjv'},
360
+ {'id': 354, 'name': 'Human hand', 'freebase_id': '/m/0k65p'},
361
+ {'id': 355, 'name': 'Tortoise', 'freebase_id': '/m/011k07'},
362
+ {'id': 356, 'name': 'Baseball glove', 'freebase_id': '/m/03grzl'},
363
+ {'id': 357, 'name': 'Sword', 'freebase_id': '/m/06y5r'},
364
+ {'id': 358, 'name': 'Pear', 'freebase_id': '/m/061_f'},
365
+ {'id': 359, 'name': 'Miniskirt', 'freebase_id': '/m/01cmb2'},
366
+ {'id': 360, 'name': 'Traffic sign', 'freebase_id': '/m/01mqdt'},
367
+ {'id': 361, 'name': 'Girl', 'freebase_id': '/m/05r655'},
368
+ {'id': 362, 'name': 'Roller skates', 'freebase_id': '/m/02p3w7d'},
369
+ {'id': 363, 'name': 'Dinosaur', 'freebase_id': '/m/029tx'},
370
+ {'id': 364, 'name': 'Porch', 'freebase_id': '/m/04m6gz'},
371
+ {'id': 365, 'name': 'Human beard', 'freebase_id': '/m/015h_t'},
372
+ {'id': 366, 'name': 'Submarine sandwich', 'freebase_id': '/m/06pcq'},
373
+ {'id': 367, 'name': 'Screwdriver', 'freebase_id': '/m/01bms0'},
374
+ {'id': 368, 'name': 'Strawberry', 'freebase_id': '/m/07fbm7'},
375
+ {'id': 369, 'name': 'Wine glass', 'freebase_id': '/m/09tvcd'},
376
+ {'id': 370, 'name': 'Seafood', 'freebase_id': '/m/06nwz'},
377
+ {'id': 371, 'name': 'Racket', 'freebase_id': '/m/0dv9c'},
378
+ {'id': 372, 'name': 'Wheel', 'freebase_id': '/m/083wq'},
379
+ {'id': 373, 'name': 'Sea lion', 'freebase_id': '/m/0gd36'},
380
+ {'id': 374, 'name': 'Toy', 'freebase_id': '/m/0138tl'},
381
+ {'id': 375, 'name': 'Tea', 'freebase_id': '/m/07clx'},
382
+ {'id': 376, 'name': 'Tennis ball', 'freebase_id': '/m/05ctyq'},
383
+ {'id': 377, 'name': 'Waste container', 'freebase_id': '/m/0bjyj5'},
384
+ {'id': 378, 'name': 'Mule', 'freebase_id': '/m/0dbzx'},
385
+ {'id': 379, 'name': 'Cricket ball', 'freebase_id': '/m/02ctlc'},
386
+ {'id': 380, 'name': 'Pineapple', 'freebase_id': '/m/0fp6w'},
387
+ {'id': 381, 'name': 'Coconut', 'freebase_id': '/m/0djtd'},
388
+ {'id': 382, 'name': 'Doll', 'freebase_id': '/m/0167gd'},
389
+ {'id': 383, 'name': 'Coffee table', 'freebase_id': '/m/078n6m'},
390
+ {'id': 384, 'name': 'Snowman', 'freebase_id': '/m/0152hh'},
391
+ {'id': 385, 'name': 'Lavender', 'freebase_id': '/m/04gth'},
392
+ {'id': 386, 'name': 'Shrimp', 'freebase_id': '/m/0ll1f78'},
393
+ {'id': 387, 'name': 'Maple', 'freebase_id': '/m/0cffdh'},
394
+ {'id': 388, 'name': 'Cowboy hat', 'freebase_id': '/m/025rp__'},
395
+ {'id': 389, 'name': 'Goggles', 'freebase_id': '/m/02_n6y'},
396
+ {'id': 390, 'name': 'Rugby ball', 'freebase_id': '/m/0wdt60w'},
397
+ {'id': 391, 'name': 'Caterpillar', 'freebase_id': '/m/0cydv'},
398
+ {'id': 392, 'name': 'Poster', 'freebase_id': '/m/01n5jq'},
399
+ {'id': 393, 'name': 'Rocket', 'freebase_id': '/m/09rvcxw'},
400
+ {'id': 394, 'name': 'Organ', 'freebase_id': '/m/013y1f'},
401
+ {'id': 395, 'name': 'Saxophone', 'freebase_id': '/m/06ncr'},
402
+ {'id': 396, 'name': 'Traffic light', 'freebase_id': '/m/015qff'},
403
+ {'id': 397, 'name': 'Cocktail', 'freebase_id': '/m/024g6'},
404
+ {'id': 398, 'name': 'Plastic bag', 'freebase_id': '/m/05gqfk'},
405
+ {'id': 399, 'name': 'Squash', 'freebase_id': '/m/0dv77'},
406
+ {'id': 400, 'name': 'Mushroom', 'freebase_id': '/m/052sf'},
407
+ {'id': 401, 'name': 'Hamburger', 'freebase_id': '/m/0cdn1'},
408
+ {'id': 402, 'name': 'Light switch', 'freebase_id': '/m/03jbxj'},
409
+ {'id': 403, 'name': 'Parachute', 'freebase_id': '/m/0cyfs'},
410
+ {'id': 404, 'name': 'Teddy bear', 'freebase_id': '/m/0kmg4'},
411
+ {'id': 405, 'name': 'Winter melon', 'freebase_id': '/m/02cvgx'},
412
+ {'id': 406, 'name': 'Deer', 'freebase_id': '/m/09kx5'},
413
+ {'id': 407, 'name': 'Musical keyboard', 'freebase_id': '/m/057cc'},
414
+ {'id': 408, 'name': 'Plumbing fixture', 'freebase_id': '/m/02pkr5'},
415
+ {'id': 409, 'name': 'Scoreboard', 'freebase_id': '/m/057p5t'},
416
+ {'id': 410, 'name': 'Baseball bat', 'freebase_id': '/m/03g8mr'},
417
+ {'id': 411, 'name': 'Envelope', 'freebase_id': '/m/0frqm'},
418
+ {'id': 412, 'name': 'Adhesive tape', 'freebase_id': '/m/03m3vtv'},
419
+ {'id': 413, 'name': 'Briefcase', 'freebase_id': '/m/0584n8'},
420
+ {'id': 414, 'name': 'Paddle', 'freebase_id': '/m/014y4n'},
421
+ {'id': 415, 'name': 'Bow and arrow', 'freebase_id': '/m/01g3x7'},
422
+ {'id': 416, 'name': 'Telephone', 'freebase_id': '/m/07cx4'},
423
+ {'id': 417, 'name': 'Sheep', 'freebase_id': '/m/07bgp'},
424
+ {'id': 418, 'name': 'Jacket', 'freebase_id': '/m/032b3c'},
425
+ {'id': 419, 'name': 'Boy', 'freebase_id': '/m/01bl7v'},
426
+ {'id': 420, 'name': 'Pizza', 'freebase_id': '/m/0663v'},
427
+ {'id': 421, 'name': 'Otter', 'freebase_id': '/m/0cn6p'},
428
+ {'id': 422, 'name': 'Office supplies', 'freebase_id': '/m/02rdsp'},
429
+ {'id': 423, 'name': 'Couch', 'freebase_id': '/m/02crq1'},
430
+ {'id': 424, 'name': 'Cello', 'freebase_id': '/m/01xqw'},
431
+ {'id': 425, 'name': 'Bull', 'freebase_id': '/m/0cnyhnx'},
432
+ {'id': 426, 'name': 'Camel', 'freebase_id': '/m/01x_v'},
433
+ {'id': 427, 'name': 'Ball', 'freebase_id': '/m/018xm'},
434
+ {'id': 428, 'name': 'Duck', 'freebase_id': '/m/09ddx'},
435
+ {'id': 429, 'name': 'Whale', 'freebase_id': '/m/084zz'},
436
+ {'id': 430, 'name': 'Shirt', 'freebase_id': '/m/01n4qj'},
437
+ {'id': 431, 'name': 'Tank', 'freebase_id': '/m/07cmd'},
438
+ {'id': 432, 'name': 'Motorcycle', 'freebase_id': '/m/04_sv'},
439
+ {'id': 433, 'name': 'Accordion', 'freebase_id': '/m/0mkg'},
440
+ {'id': 434, 'name': 'Owl', 'freebase_id': '/m/09d5_'},
441
+ {'id': 435, 'name': 'Porcupine', 'freebase_id': '/m/0c568'},
442
+ {'id': 436, 'name': 'Sun hat', 'freebase_id': '/m/02wbtzl'},
443
+ {'id': 437, 'name': 'Nail', 'freebase_id': '/m/05bm6'},
444
+ {'id': 438, 'name': 'Scissors', 'freebase_id': '/m/01lsmm'},
445
+ {'id': 439, 'name': 'Swan', 'freebase_id': '/m/0dftk'},
446
+ {'id': 440, 'name': 'Lamp', 'freebase_id': '/m/0dtln'},
447
+ {'id': 441, 'name': 'Crown', 'freebase_id': '/m/0nl46'},
448
+ {'id': 442, 'name': 'Piano', 'freebase_id': '/m/05r5c'},
449
+ {'id': 443, 'name': 'Sculpture', 'freebase_id': '/m/06msq'},
450
+ {'id': 444, 'name': 'Cheetah', 'freebase_id': '/m/0cd4d'},
451
+ {'id': 445, 'name': 'Oboe', 'freebase_id': '/m/05kms'},
452
+ {'id': 446, 'name': 'Tin can', 'freebase_id': '/m/02jnhm'},
453
+ {'id': 447, 'name': 'Mango', 'freebase_id': '/m/0fldg'},
454
+ {'id': 448, 'name': 'Tripod', 'freebase_id': '/m/073bxn'},
455
+ {'id': 449, 'name': 'Oven', 'freebase_id': '/m/029bxz'},
456
+ {'id': 450, 'name': 'Mouse', 'freebase_id': '/m/020lf'},
457
+ {'id': 451, 'name': 'Barge', 'freebase_id': '/m/01btn'},
458
+ {'id': 452, 'name': 'Coffee', 'freebase_id': '/m/02vqfm'},
459
+ {'id': 453, 'name': 'Snowboard', 'freebase_id': '/m/06__v'},
460
+ {'id': 454, 'name': 'Common fig', 'freebase_id': '/m/043nyj'},
461
+ {'id': 455, 'name': 'Salad', 'freebase_id': '/m/0grw1'},
462
+ {'id': 456, 'name': 'Marine invertebrates', 'freebase_id': '/m/03hl4l9'},
463
+ {'id': 457, 'name': 'Umbrella', 'freebase_id': '/m/0hnnb'},
464
+ {'id': 458, 'name': 'Kangaroo', 'freebase_id': '/m/04c0y'},
465
+ {'id': 459, 'name': 'Human arm', 'freebase_id': '/m/0dzf4'},
466
+ {'id': 460, 'name': 'Measuring cup', 'freebase_id': '/m/07v9_z'},
467
+ {'id': 461, 'name': 'Snail', 'freebase_id': '/m/0f9_l'},
468
+ {'id': 462, 'name': 'Loveseat', 'freebase_id': '/m/0703r8'},
469
+ {'id': 463, 'name': 'Suit', 'freebase_id': '/m/01xyhv'},
470
+ {'id': 464, 'name': 'Teapot', 'freebase_id': '/m/01fh4r'},
471
+ {'id': 465, 'name': 'Bottle', 'freebase_id': '/m/04dr76w'},
472
+ {'id': 466, 'name': 'Alpaca', 'freebase_id': '/m/0pcr'},
473
+ {'id': 467, 'name': 'Kettle', 'freebase_id': '/m/03s_tn'},
474
+ {'id': 468, 'name': 'Trousers', 'freebase_id': '/m/07mhn'},
475
+ {'id': 469, 'name': 'Popcorn', 'freebase_id': '/m/01hrv5'},
476
+ {'id': 470, 'name': 'Centipede', 'freebase_id': '/m/019h78'},
477
+ {'id': 471, 'name': 'Spider', 'freebase_id': '/m/09kmb'},
478
+ {'id': 472, 'name': 'Sparrow', 'freebase_id': '/m/0h23m'},
479
+ {'id': 473, 'name': 'Plate', 'freebase_id': '/m/050gv4'},
480
+ {'id': 474, 'name': 'Bagel', 'freebase_id': '/m/01fb_0'},
481
+ {'id': 475, 'name': 'Personal care', 'freebase_id': '/m/02w3_ws'},
482
+ {'id': 476, 'name': 'Apple', 'freebase_id': '/m/014j1m'},
483
+ {'id': 477, 'name': 'Brassiere', 'freebase_id': '/m/01gmv2'},
484
+ {'id': 478, 'name': 'Bathroom cabinet', 'freebase_id': '/m/04y4h8h'},
485
+ {'id': 479, 'name': 'studio couch', 'freebase_id': '/m/026qbn5'},
486
+ {'id': 480, 'name': 'Computer keyboard', 'freebase_id': '/m/01m2v'},
487
+ {'id': 481, 'name': 'Table tennis racket', 'freebase_id': '/m/05_5p_0'},
488
+ {'id': 482, 'name': 'Sushi', 'freebase_id': '/m/07030'},
489
+ {'id': 483, 'name': 'Cabinetry', 'freebase_id': '/m/01s105'},
490
+ {'id': 484, 'name': 'Street light', 'freebase_id': '/m/033rq4'},
491
+ {'id': 485, 'name': 'Towel', 'freebase_id': '/m/0162_1'},
492
+ {'id': 486, 'name': 'Nightstand', 'freebase_id': '/m/02z51p'},
493
+ {'id': 487, 'name': 'Rabbit', 'freebase_id': '/m/06mf6'},
494
+ {'id': 488, 'name': 'Dolphin', 'freebase_id': '/m/02hj4'},
495
+ {'id': 489, 'name': 'Dog', 'freebase_id': '/m/0bt9lr'},
496
+ {'id': 490, 'name': 'Jug', 'freebase_id': '/m/08hvt4'},
497
+ {'id': 491, 'name': 'Wok', 'freebase_id': '/m/084rd'},
498
+ {'id': 492, 'name': 'Fire hydrant', 'freebase_id': '/m/01pns0'},
499
+ {'id': 493, 'name': 'Human eye', 'freebase_id': '/m/014sv8'},
500
+ {'id': 494, 'name': 'Skyscraper', 'freebase_id': '/m/079cl'},
501
+ {'id': 495, 'name': 'Backpack', 'freebase_id': '/m/01940j'},
502
+ {'id': 496, 'name': 'Potato', 'freebase_id': '/m/05vtc'},
503
+ {'id': 497, 'name': 'Paper towel', 'freebase_id': '/m/02w3r3'},
504
+ {'id': 498, 'name': 'Lifejacket', 'freebase_id': '/m/054xkw'},
505
+ {'id': 499, 'name': 'Bicycle wheel', 'freebase_id': '/m/01bqk0'},
506
+ {'id': 500, 'name': 'Toilet', 'freebase_id': '/m/09g1w'},
507
+ ]
508
+
509
+
510
+ def _get_builtin_metadata(cats):
511
+ id_to_name = {x['id']: x['name'] for x in cats}
512
+ thing_dataset_id_to_contiguous_id = {i + 1: i for i in range(len(cats))}
513
+ thing_classes = [x['name'] for x in sorted(cats, key=lambda x: x['id'])]
514
+ return {
515
+ "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id,
516
+ "thing_classes": thing_classes}
517
+
518
+ _PREDEFINED_SPLITS_OID = {
519
+ # cat threshold: 500, 1500: r 170, c 151, f 179
520
+ "oid_train": ("oid/images/", "oid/annotations/oid_challenge_2019_train_bbox.json"),
521
+ # "expanded" duplicates annotations to their parent classes based on the official
522
+ # hierarchy. This is used in the official evaluation protocol.
523
+ # https://storage.googleapis.com/openimages/web/evaluation.html
524
+ "oid_val_expanded": ("oid/images/validation/", "oid/annotations/oid_challenge_2019_val_expanded.json"),
525
+ "oid_val_expanded_rare": ("oid/images/validation/", "oid/annotations/oid_challenge_2019_val_expanded_rare.json"),
526
+ }
527
+
528
+
529
+ for key, (image_root, json_file) in _PREDEFINED_SPLITS_OID.items():
530
+ register_oid_instances(
531
+ key,
532
+ _get_builtin_metadata(categories),
533
+ os.path.join("datasets", json_file) if "://" not in json_file else json_file,
534
+ os.path.join("datasets", image_root),
535
+ )
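The `_get_builtin_metadata(cats)` helper above builds the contiguous mapping as `{i + 1: i}`, which is only valid because the OID ids run exactly from 1 to 500. A standalone sanity-check sketch of that assumption (not part of the repo):

    # Sketch: verify the 1..N id assumption behind the {i + 1: i} mapping above.
    def contiguous_mapping(cats):
        ids = sorted(c["id"] for c in cats)
        assert ids == list(range(1, len(cats) + 1)), "ids must be exactly 1..N"
        return {i + 1: i for i in range(len(cats))}

    # contiguous_mapping(categories)  # holds for the 500-entry list above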
proxydet/data/datasets/register_oid.py ADDED
@@ -0,0 +1,122 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ # Modified by Xingyi Zhou from https://github.com/facebookresearch/detectron2/blob/master/detectron2/data/datasets/coco.py
3
+ import copy
4
+ import io
5
+ import logging
6
+ import contextlib
7
+ import os
8
+ import datetime
9
+ import json
10
+ import numpy as np
11
+
12
+ from PIL import Image
13
+
14
+ from fvcore.common.timer import Timer
15
+ from fvcore.common.file_io import PathManager, file_lock
16
+ from detectron2.structures import BoxMode, PolygonMasks, Boxes
17
+ from detectron2.data import DatasetCatalog, MetadataCatalog
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+ """
22
+ This file contains functions to register an Open Images (OID) dataset in COCO json format to the DatasetCatalog.
23
+ """
24
+
25
+ __all__ = ["register_oid_instances", "load_coco_json_mem_efficient"]
26
+
27
+
28
+
29
+ def register_oid_instances(name, metadata, json_file, image_root):
30
+ """
31
+ """
32
+ # 1. register a function which returns dicts
33
+ DatasetCatalog.register(name, lambda: load_coco_json_mem_efficient(
34
+ json_file, image_root, name))
35
+
36
+ # 2. Optionally, add metadata about this dataset,
37
+ # since they might be useful in evaluation, visualization or logging
38
+ MetadataCatalog.get(name).set(
39
+ json_file=json_file, image_root=image_root, evaluator_type="oid", **metadata
40
+ )
41
+
42
+
43
+ def load_coco_json_mem_efficient(json_file, image_root, dataset_name=None, extra_annotation_keys=None):
44
+ """
45
+ Actually not mem efficient
46
+ """
47
+ from pycocotools.coco import COCO
48
+
49
+ timer = Timer()
50
+ json_file = PathManager.get_local_path(json_file)
51
+ with contextlib.redirect_stdout(io.StringIO()):
52
+ coco_api = COCO(json_file)
53
+ if timer.seconds() > 1:
54
+ logger.info("Loading {} takes {:.2f} seconds.".format(json_file, timer.seconds()))
55
+
56
+ id_map = None
57
+ if dataset_name is not None:
58
+ meta = MetadataCatalog.get(dataset_name)
59
+ cat_ids = sorted(coco_api.getCatIds())
60
+ cats = coco_api.loadCats(cat_ids)
61
+ # The categories in a custom json file may not be sorted.
62
+ thing_classes = [c["name"] for c in sorted(cats, key=lambda x: x["id"])]
63
+ meta.thing_classes = thing_classes
64
+
65
+ if not (min(cat_ids) == 1 and max(cat_ids) == len(cat_ids)):
66
+ if "coco" not in dataset_name:
67
+ logger.warning(
68
+ """
69
+ Category ids in annotations are not in [1, #categories]! We'll apply a mapping for you.
70
+ """
71
+ )
72
+ id_map = {v: i for i, v in enumerate(cat_ids)}
73
+ meta.thing_dataset_id_to_contiguous_id = id_map
74
+
75
+ # sort indices for reproducible results
76
+ img_ids = sorted(coco_api.imgs.keys())
77
+ imgs = coco_api.loadImgs(img_ids)
78
+ logger.info("Loaded {} images in COCO format from {}".format(len(imgs), json_file))
79
+
80
+ dataset_dicts = []
81
+
82
+ ann_keys = ["iscrowd", "bbox", "category_id"] + (extra_annotation_keys or [])
83
+
84
+ for img_dict in imgs:
85
+ record = {}
86
+ record["file_name"] = os.path.join(image_root, img_dict["file_name"])
87
+ record["height"] = img_dict["height"]
88
+ record["width"] = img_dict["width"]
89
+ image_id = record["image_id"] = img_dict["id"]
90
+ anno_dict_list = coco_api.imgToAnns[image_id]
91
+ if 'neg_category_ids' in img_dict:
92
+ record['neg_category_ids'] = \
93
+ [id_map[x] for x in img_dict['neg_category_ids']]
94
+
95
+ objs = []
96
+ for anno in anno_dict_list:
97
+ assert anno["image_id"] == image_id
98
+
99
+ assert anno.get("ignore", 0) == 0
100
+
101
+ obj = {key: anno[key] for key in ann_keys if key in anno}
102
+
103
+ segm = anno.get("segmentation", None)
104
+ if segm: # either list[list[float]] or dict(RLE)
105
+ if not isinstance(segm, dict):
106
+ # filter out invalid polygons (< 3 points)
107
+ segm = [poly for poly in segm if len(poly) % 2 == 0 and len(poly) >= 6]
108
+ if len(segm) == 0:
109
+ num_instances_without_valid_segmentation += 1
110
+ continue # ignore this instance
111
+ obj["segmentation"] = segm
112
+
113
+ obj["bbox_mode"] = BoxMode.XYWH_ABS
114
+
115
+ if id_map:
116
+ obj["category_id"] = id_map[obj["category_id"]]
117
+ objs.append(obj)
118
+ record["annotations"] = objs
119
+ dataset_dicts.append(record)
120
+
121
+ del coco_api
122
+ return dataset_dicts
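For reference, each record produced by `load_coco_json_mem_efficient` follows Detectron2's standard dataset-dict layout; an illustrative example of one record (the values below are placeholders, not real annotations):

    # Illustrative record shape only -- not real data.
    record = {
        "file_name": "datasets/oid/images/validation/example.jpg",
        "height": 683,
        "width": 1024,
        "image_id": 1,
        "neg_category_ids": [12, 87],            # present only if the json provides them
        "annotations": [{
            "iscrowd": 0,
            "bbox": [10.0, 20.0, 200.0, 150.0],  # XYWH_ABS, as set above
            "bbox_mode": 1,                      # BoxMode.XYWH_ABS
            "category_id": 4,                    # contiguous id after id_map is applied
        }],
    }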
proxydet/data/tar_dataset.py ADDED
@@ -0,0 +1,138 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ import os
4
+ import gzip
5
+ import numpy as np
6
+ import io
7
+ from PIL import Image
8
+ from torch.utils.data import Dataset
9
+
10
+ try:
11
+ from PIL import UnidentifiedImageError
12
+
13
+ unidentified_error_available = True
14
+ except ImportError:
15
+ # UnidentifiedImageError isn't available in older versions of PIL
16
+ unidentified_error_available = False
17
+
18
+ class DiskTarDataset(Dataset):
19
+ def __init__(self,
20
+ tarfile_path='dataset/imagenet/ImageNet-21k/metadata/tar_files.npy',
21
+ tar_index_dir='dataset/imagenet/ImageNet-21k/metadata/tarindex_npy',
22
+ preload=False,
23
+ num_synsets="all"):
24
+ """
25
+ - preload (bool): Recommend to set preload to False when using
26
+ - num_synsets (integer or string "all"): set to small number for debugging
27
+ will load subset of dataset
28
+ """
29
+ tar_files = np.load(tarfile_path)
30
+
31
+ chunk_datasets = []
32
+ dataset_lens = []
33
+ if isinstance(num_synsets, int):
34
+ assert num_synsets < len(tar_files)
35
+ tar_files = tar_files[:num_synsets]
36
+ for tar_file in tar_files:
37
+ dataset = _TarDataset(tar_file, tar_index_dir, preload=preload)
38
+ chunk_datasets.append(dataset)
39
+ dataset_lens.append(len(dataset))
40
+
41
+ self.chunk_datasets = chunk_datasets
42
+ self.dataset_lens = np.array(dataset_lens).astype(np.int32)
43
+ self.dataset_cumsums = np.cumsum(self.dataset_lens)
44
+ self.num_samples = sum(self.dataset_lens)
45
+ labels = np.zeros(self.dataset_lens.sum(), dtype=np.int64)
46
+ sI = 0
47
+ for k in range(len(self.dataset_lens)):
48
+ assert (sI+self.dataset_lens[k]) <= len(labels), f"{k} {sI+self.dataset_lens[k]} vs. {len(labels)}"
49
+ labels[sI:(sI+self.dataset_lens[k])] = k
50
+ sI += self.dataset_lens[k]
51
+ self.labels = labels
52
+
53
+ def __len__(self):
54
+ return self.num_samples
55
+
56
+ def __getitem__(self, index):
57
+ assert index >= 0 and index < len(self)
58
+ # find the dataset file we need to go to
59
+ d_index = np.searchsorted(self.dataset_cumsums, index)
60
+
61
+ # edge case, if index is at edge of chunks, move right
62
+ if index in self.dataset_cumsums:
63
+ d_index += 1
64
+
65
+ assert d_index == self.labels[index], f"{d_index} vs. {self.labels[index]} mismatch for {index}"
66
+
67
+ # change index to local dataset index
68
+ if d_index == 0:
69
+ local_index = index
70
+ else:
71
+ local_index = index - self.dataset_cumsums[d_index - 1]
72
+ data_bytes = self.chunk_datasets[d_index][local_index]
73
+ exception_to_catch = UnidentifiedImageError if unidentified_error_available else Exception
74
+ try:
75
+ image = Image.open(data_bytes).convert("RGB")
76
+ except exception_to_catch:
77
+ image = Image.fromarray(np.ones((224,224,3), dtype=np.uint8)*128)
78
+ d_index = -1
79
+
80
+ # label is the dataset (synset) we indexed into
81
+ return image, d_index, index
82
+
83
+ def __repr__(self):
84
+ st = f"DiskTarDataset(subdatasets={len(self.dataset_lens)},samples={self.num_samples})"
85
+ return st
86
+
87
+ class _TarDataset(object):
88
+
89
+ def __init__(self, filename, npy_index_dir, preload=False):
90
+ # translated from
91
+ # fbcode/experimental/deeplearning/matthijs/comp_descs/tardataset.lua
92
+ self.filename = filename
93
+ self.names = []
94
+ self.offsets = []
95
+ self.npy_index_dir = npy_index_dir
96
+ names, offsets = self.load_index()
97
+
98
+ self.num_samples = len(names)
99
+ if preload:
100
+ self.data = np.memmap(filename, mode='r', dtype='uint8')
101
+ self.offsets = offsets
102
+ else:
103
+ self.data = None
104
+
105
+
106
+ def __len__(self):
107
+ return self.num_samples
108
+
109
+ def load_index(self):
110
+ basename = os.path.basename(self.filename)
111
+ basename = os.path.splitext(basename)[0]
112
+ names = np.load(os.path.join(self.npy_index_dir, f"{basename}_names.npy"))
113
+ offsets = np.load(os.path.join(self.npy_index_dir, f"{basename}_offsets.npy"))
114
+ return names, offsets
115
+
116
+ def __getitem__(self, idx):
117
+ if self.data is None:
118
+ self.data = np.memmap(self.filename, mode='r', dtype='uint8')
119
+ _, self.offsets = self.load_index()
120
+
121
+ ofs = self.offsets[idx] * 512
122
+ fsize = 512 * (self.offsets[idx + 1] - self.offsets[idx])
123
+ data = self.data[ofs:ofs + fsize]
124
+
125
+ if data[:13].tobytes() == b'././@LongLink':  # compare bytes to bytes (Python 3)
126
+ data = data[3 * 512:]
127
+ else:
128
+ data = data[512:]
129
+
130
+ # just to make it more fun a few JPEGs are GZIP compressed...
131
+ # catch this case
132
+ if tuple(data[:2]) == (0x1f, 0x8b):
133
+ s = io.BytesIO(data.tobytes())  # tostring() is deprecated in NumPy
134
+ g = gzip.GzipFile(None, 'r', 0, s)
135
+ sdata = g.read()
136
+ else:
137
+ sdata = data.tobytes()
138
+ return io.BytesIO(sdata)
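The chunk lookup in `DiskTarDataset.__getitem__` (cumulative sums plus `searchsorted`, with a correction at chunk boundaries) can be traced with toy numbers; this sketch mirrors that logic without touching any tar files:

    import numpy as np

    # Toy version of the global-index -> (chunk, local-index) mapping used above.
    dataset_lens = np.array([3, 5, 2])       # three synset tar files
    cumsums = np.cumsum(dataset_lens)        # [3, 8, 10]

    def locate(index):
        d_index = np.searchsorted(cumsums, index)
        if index in cumsums:                 # boundary case, move to the next chunk
            d_index += 1
        local = index if d_index == 0 else index - cumsums[d_index - 1]
        return int(d_index), int(local)

    assert locate(0) == (0, 0)
    assert locate(3) == (1, 0)               # first sample of the second tar file
    assert locate(9) == (2, 1)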
proxydet/data/transforms/custom_augmentation_impl.py ADDED
@@ -0,0 +1,60 @@
+ # -*- coding: utf-8 -*-
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+ # Part of the code is from https://github.com/rwightman/efficientdet-pytorch/blob/master/effdet/data/transforms.py
+ # Modified by Xingyi Zhou
+ # The original code is under Apache-2.0 License
+ import numpy as np
+ import sys
+ from fvcore.transforms.transform import (
+     BlendTransform,
+     CropTransform,
+     HFlipTransform,
+     NoOpTransform,
+     Transform,
+     VFlipTransform,
+ )
+ from PIL import Image
+
+ from detectron2.data.transforms.augmentation import Augmentation
+ from .custom_transform import EfficientDetResizeCropTransform
+
+ __all__ = [
+     "EfficientDetResizeCrop",
+ ]
+
+ class EfficientDetResizeCrop(Augmentation):
+     """
+     Resize the image so it fits inside a square window of side `scale_factor * size`
+     (with `scale_factor` drawn uniformly from `scale`), then take a random crop of
+     at most `size x size` from the resized image.
+     """
+
+     def __init__(
+         self, size, scale, interp=Image.BILINEAR
+     ):
+         """
+         Args:
+             size (int): side length of the square target crop.
+             scale (tuple[float, float]): range of the random scale jitter.
+             interp: PIL interpolation method, defaults to bilinear.
+         """
+         super().__init__()
+         self.target_size = (size, size)
+         self.scale = scale
+         self.interp = interp
+
+     def get_transform(self, img):
+         # Select a random scale factor.
+         scale_factor = np.random.uniform(*self.scale)
+         scaled_target_height = scale_factor * self.target_size[0]
+         scaled_target_width = scale_factor * self.target_size[1]
+         # Recompute the accurate scale_factor using rounded scaled image size.
+         width, height = img.shape[1], img.shape[0]
+         img_scale_y = scaled_target_height / height
+         img_scale_x = scaled_target_width / width
+         img_scale = min(img_scale_y, img_scale_x)
+
+         # Select non-zero random offset (x, y) if scaled image is larger than target size
+         scaled_h = int(height * img_scale)
+         scaled_w = int(width * img_scale)
+         offset_y = scaled_h - self.target_size[0]
+         offset_x = scaled_w - self.target_size[1]
+         offset_y = int(max(0.0, float(offset_y)) * np.random.uniform(0, 1))
+         offset_x = int(max(0.0, float(offset_x)) * np.random.uniform(0, 1))
+         return EfficientDetResizeCropTransform(
+             scaled_h, scaled_w, offset_y, offset_x, img_scale, self.target_size, self.interp)
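A short usage sketch of the augmentation above with detectron2's transform API (synthetic image, illustrative size/scale values rather than the values the ProxyDet configs actually use):

import numpy as np
from PIL import Image

aug = EfficientDetResizeCrop(size=640, scale=(0.1, 2.0), interp=Image.BILINEAR)

img = (np.random.rand(480, 640, 3) * 255).astype(np.uint8)   # HWC uint8
tfm = aug.get_transform(img)          # an EfficientDetResizeCropTransform
out = tfm.apply_image(img)            # resized, then cropped to at most 640x640
boxes = np.array([[10., 20., 200., 240.]])
out_boxes = tfm.apply_box(boxes)      # boxes follow the same scale and offsets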
proxydet/data/transforms/custom_transform.py ADDED
@@ -0,0 +1,114 @@
+ # -*- coding: utf-8 -*-
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+ # Part of the code is from https://github.com/rwightman/efficientdet-pytorch/blob/master/effdet/data/transforms.py
+ # Modified by Xingyi Zhou
+ # The original code is under Apache-2.0 License
+ import numpy as np
+ import torch
+ import torch.nn.functional as F
+ from fvcore.transforms.transform import (
+     CropTransform,
+     HFlipTransform,
+     NoOpTransform,
+     Transform,
+     TransformList,
+ )
+ from PIL import Image
+
+ try:
+     import cv2  # noqa
+ except ImportError:
+     # OpenCV is an optional dependency at the moment
+     pass
+
+ __all__ = [
+     "EfficientDetResizeCropTransform",
+ ]
+
+ class EfficientDetResizeCropTransform(Transform):
+     """
+     Resize an image by `img_scale`, then crop a `target_size` window starting at
+     (`offset_y`, `offset_x`). The `inverse_apply_*` helpers map coordinates from
+     the transformed frame back to the original image.
+     """
+
+     def __init__(self, scaled_h, scaled_w, offset_y, offset_x, img_scale, \
+         target_size, interp=None):
+         """
+         Args:
+             scaled_h, scaled_w (int): image size after resizing by `img_scale`.
+             offset_y, offset_x (int): top-left corner of the crop in the resized image.
+             img_scale (float): resize factor relative to the original image.
+             target_size (tuple[int, int]): (height, width) of the crop window.
+             interp: PIL interpolation method, defaults to bilinear.
+         """
+         # TODO decide on PIL vs opencv
+         super().__init__()
+         if interp is None:
+             interp = Image.BILINEAR
+         self._set_attributes(locals())
+
+     def apply_image(self, img, interp=None):
+         assert len(img.shape) <= 4
+
+         if img.dtype == np.uint8:
+             pil_image = Image.fromarray(img)
+             interp_method = interp if interp is not None else self.interp
+             pil_image = pil_image.resize((self.scaled_w, self.scaled_h), interp_method)
+             ret = np.asarray(pil_image)
+             right = min(self.scaled_w, self.offset_x + self.target_size[1])
+             lower = min(self.scaled_h, self.offset_y + self.target_size[0])
+             if len(ret.shape) <= 3:
+                 ret = ret[self.offset_y: lower, self.offset_x: right]
+             else:
+                 ret = ret[..., self.offset_y: lower, self.offset_x: right, :]
+         else:
+             # PIL only supports uint8
+             img = torch.from_numpy(img)
+             shape = list(img.shape)
+             shape_4d = shape[:2] + [1] * (4 - len(shape)) + shape[2:]
+             img = img.view(shape_4d).permute(2, 3, 0, 1)  # hw(c) -> nchw
+             _PIL_RESIZE_TO_INTERPOLATE_MODE = {Image.BILINEAR: "bilinear", Image.BICUBIC: "bicubic"}
+             mode = _PIL_RESIZE_TO_INTERPOLATE_MODE[self.interp]
+             img = F.interpolate(img, (self.scaled_h, self.scaled_w), mode=mode, align_corners=False)
+             shape[:2] = (self.scaled_h, self.scaled_w)
+             ret = img.permute(2, 3, 0, 1).view(shape).numpy()  # nchw -> hw(c)
+             right = min(self.scaled_w, self.offset_x + self.target_size[1])
+             lower = min(self.scaled_h, self.offset_y + self.target_size[0])
+             if len(ret.shape) <= 3:
+                 ret = ret[self.offset_y: lower, self.offset_x: right]
+             else:
+                 ret = ret[..., self.offset_y: lower, self.offset_x: right, :]
+         return ret
+
+     def apply_coords(self, coords):
+         coords[:, 0] = coords[:, 0] * self.img_scale
+         coords[:, 1] = coords[:, 1] * self.img_scale
+         coords[:, 0] -= self.offset_x
+         coords[:, 1] -= self.offset_y
+         return coords
+
+     def apply_segmentation(self, segmentation):
+         segmentation = self.apply_image(segmentation, interp=Image.NEAREST)
+         return segmentation
+
+     def inverse(self):
+         raise NotImplementedError
+
+     def inverse_apply_coords(self, coords):
+         coords[:, 0] += self.offset_x
+         coords[:, 1] += self.offset_y
+         coords[:, 0] = coords[:, 0] / self.img_scale
+         coords[:, 1] = coords[:, 1] / self.img_scale
+         return coords
+
+     def inverse_apply_box(self, box: np.ndarray) -> np.ndarray:
+         """
+         Map XYXY boxes from the resized/cropped frame back to the original image.
+         """
+         idxs = np.array([(0, 1), (2, 1), (0, 3), (2, 3)]).flatten()
+         coords = np.asarray(box).reshape(-1, 4)[:, idxs].reshape(-1, 2)
+         coords = self.inverse_apply_coords(coords).reshape((-1, 4, 2))
+         minxy = coords.min(axis=1)
+         maxxy = coords.max(axis=1)
+         trans_boxes = np.concatenate((minxy, maxxy), axis=1)
+         return trans_boxes
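The inverse_* helpers are what lets detections from the network's input frame be mapped back onto the original image. A sketch with made-up numbers (all parameter values here are illustrative):

import numpy as np
from PIL import Image

tfm = EfficientDetResizeCropTransform(
    scaled_h=384, scaled_w=512, offset_y=0, offset_x=32,
    img_scale=0.8, target_size=(384, 384), interp=Image.BILINEAR,
)
pred = np.array([[48., 40., 208., 200.]])   # XYXY in the resized/cropped frame
orig = tfm.inverse_apply_box(pred)          # XYXY back in the original image:
                                            # crop offsets are added back, then
                                            # both axes are divided by img_scale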
proxydet/evaluation/custom_coco_eval.py ADDED
@@ -0,0 +1,124 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ import contextlib
3
+ import copy
4
+ import io
5
+ import itertools
6
+ import json
7
+ import logging
8
+ import numpy as np
9
+ import os
10
+ import pickle
11
+ from collections import OrderedDict
12
+ import pycocotools.mask as mask_util
13
+ import torch
14
+ from pycocotools.coco import COCO
15
+ from pycocotools.cocoeval import COCOeval
16
+ from tabulate import tabulate
17
+
18
+ import detectron2.utils.comm as comm
19
+ from detectron2.config import CfgNode
20
+ from detectron2.data import MetadataCatalog
21
+ from detectron2.data.datasets.coco import convert_to_coco_json
22
+ from detectron2.evaluation.coco_evaluation import COCOEvaluator
23
+ from detectron2.structures import Boxes, BoxMode, pairwise_iou
24
+ from detectron2.utils.file_io import PathManager
25
+ from detectron2.utils.logger import create_small_table
26
+ from ..data.datasets.coco_zeroshot import categories_seen, categories_unseen
27
+
28
+ class CustomCOCOEvaluator(COCOEvaluator):
29
+ def _derive_coco_results(self, coco_eval, iou_type, class_names=None):
30
+ """
31
+ Additionally plot mAP for 'seen classes' and 'unseen classes'
32
+ """
33
+
34
+ metrics = {
35
+ "bbox": ["AP", "AP50", "AP75", "APs", "APm", "APl"],
36
+ "segm": ["AP", "AP50", "AP75", "APs", "APm", "APl"],
37
+ "keypoints": ["AP", "AP50", "AP75", "APm", "APl"],
38
+ }[iou_type]
39
+
40
+ if coco_eval is None:
41
+ self._logger.warn("No predictions from the model!")
42
+ return {metric: float("nan") for metric in metrics}
43
+
44
+ # the standard metrics
45
+ results = {
46
+ metric: float(coco_eval.stats[idx] * 100 if coco_eval.stats[idx] >= 0 else "nan")
47
+ for idx, metric in enumerate(metrics)
48
+ }
49
+ self._logger.info(
50
+ "Evaluation results for {}: \n".format(iou_type) + create_small_table(results)
51
+ )
52
+ if not np.isfinite(sum(results.values())):
53
+ self._logger.info("Some metrics cannot be computed and is shown as NaN.")
54
+
55
+ if class_names is None or len(class_names) <= 1:
56
+ return results
57
+ # Compute per-category AP
58
+ # from https://github.com/facebookresearch/Detectron/blob/a6a835f5b8208c45d0dce217ce9bbda915f44df7/detectron/datasets/json_dataset_evaluator.py#L222-L252 # noqa
59
+ precisions = coco_eval.eval["precision"]
60
+ # precision has dims (iou, recall, cls, area range, max dets)
61
+ assert len(class_names) == precisions.shape[2]
62
+
63
+ seen_names = set([x['name'] for x in categories_seen])
64
+ unseen_names = set([x['name'] for x in categories_unseen])
65
+ results_per_category = []
66
+ results_per_category50 = []
67
+ results_per_category50_seen = []
68
+ results_per_category50_unseen = []
69
+ for idx, name in enumerate(class_names):
70
+ # area range index 0: all area ranges
71
+ # max dets index -1: typically 100 per image
72
+ precision = precisions[:, :, idx, 0, -1]
73
+ precision = precision[precision > -1]
74
+ ap = np.mean(precision) if precision.size else float("nan")
75
+ results_per_category.append(("{}".format(name), float(ap * 100)))
76
+ precision50 = precisions[0, :, idx, 0, -1]
77
+ precision50 = precision50[precision50 > -1]
78
+ ap50 = np.mean(precision50) if precision50.size else float("nan")
79
+ results_per_category50.append(("{}".format(name), float(ap50 * 100)))
80
+ if name in seen_names:
81
+ results_per_category50_seen.append(float(ap50 * 100))
82
+ if name in unseen_names:
83
+ results_per_category50_unseen.append(float(ap50 * 100))
84
+
85
+ # tabulate it
86
+ N_COLS = min(6, len(results_per_category) * 2)
87
+ results_flatten = list(itertools.chain(*results_per_category))
88
+ results_2d = itertools.zip_longest(*[results_flatten[i::N_COLS] for i in range(N_COLS)])
89
+ table = tabulate(
90
+ results_2d,
91
+ tablefmt="pipe",
92
+ floatfmt=".3f",
93
+ headers=["category", "AP"] * (N_COLS // 2),
94
+ numalign="left",
95
+ )
96
+ self._logger.info("Per-category {} AP: \n".format(iou_type) + table)
97
+
98
+
99
+ N_COLS = min(6, len(results_per_category50) * 2)
100
+ results_flatten = list(itertools.chain(*results_per_category50))
101
+ results_2d = itertools.zip_longest(*[results_flatten[i::N_COLS] for i in range(N_COLS)])
102
+ table = tabulate(
103
+ results_2d,
104
+ tablefmt="pipe",
105
+ floatfmt=".3f",
106
+ headers=["category", "AP50"] * (N_COLS // 2),
107
+ numalign="left",
108
+ )
109
+ self._logger.info("Per-category {} AP50: \n".format(iou_type) + table)
110
+ self._logger.info(
111
+ "Seen {} AP50: {}".format(
112
+ iou_type,
113
+ sum(results_per_category50_seen) / len(results_per_category50_seen),
114
+ ))
115
+ self._logger.info(
116
+ "Unseen {} AP50: {}".format(
117
+ iou_type,
118
+ sum(results_per_category50_unseen) / len(results_per_category50_unseen),
119
+ ))
120
+
121
+ results.update({"AP-" + name: ap for name, ap in results_per_category})
122
+ results["AP50-seen"] = sum(results_per_category50_seen) / len(results_per_category50_seen)
123
+ results["AP50-unseen"] = sum(results_per_category50_unseen) / len(results_per_category50_unseen)
124
+ return results
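The seen/unseen numbers above are plain means over per-class AP50 values sliced out of COCOeval's precision array. A toy sketch of the slicing convention (synthetic array, not real evaluator output):

import numpy as np

# COCOeval precision has dims (iou, recall, cls, area range, max dets).
num_classes = 3
precisions = np.random.rand(10, 101, num_classes, 4, 3)

idx = 1                                 # some class index
p50 = precisions[0, :, idx, 0, -1]      # IoU=0.50, all areas, largest max-dets
p50 = p50[p50 > -1]                     # drop entries marked as absent (-1)
ap50 = float(np.mean(p50)) * 100 if p50.size else float("nan")

# Seen/unseen AP50 is then the mean of ap50 over the class names that fall
# into categories_seen / categories_unseen.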
proxydet/evaluation/oideval.py ADDED
@@ -0,0 +1,699 @@
1
+ # Part of the code is from https://github.com/tensorflow/models/blob/master/research/object_detection/metrics/oid_challenge_evaluation.py
2
+ # Copyright 2018 The TensorFlow Authors. All Rights Reserved.
3
+ # The original code is under Apache License, Version 2.0 (the "License");
4
+ # Part of the code is from https://github.com/lvis-dataset/lvis-api/blob/master/lvis/eval.py
5
+ # Copyright (c) 2019, Agrim Gupta and Ross Girshick
6
+ # Modified by Xingyi Zhou
7
+ # This script re-implement OpenImages evaluation in detectron2
8
+ # The code is from https://github.com/xingyizhou/UniDet/blob/master/projects/UniDet/unidet/evaluation/oideval.py
9
+ # The original code is under Apache-2.0 License
10
+ # Copyright (c) Facebook, Inc. and its affiliates.
11
+ import os
12
+ import datetime
13
+ import logging
14
+ import itertools
15
+ from collections import OrderedDict
16
+ from collections import defaultdict
17
+ import copy
18
+ import json
19
+ import numpy as np
20
+ import torch
21
+ from tabulate import tabulate
22
+
23
+ from lvis.lvis import LVIS
24
+ from lvis.results import LVISResults
25
+
26
+ import pycocotools.mask as mask_utils
27
+
28
+ from fvcore.common.file_io import PathManager
29
+ import detectron2.utils.comm as comm
30
+ from detectron2.data import MetadataCatalog
31
+ from detectron2.evaluation.coco_evaluation import instances_to_coco_json
32
+ from detectron2.utils.logger import create_small_table
33
+ from detectron2.evaluation import DatasetEvaluator
34
+
35
+ def compute_average_precision(precision, recall):
36
+ """Compute Average Precision according to the definition in VOCdevkit.
37
+ Precision is modified to ensure that it does not decrease as recall
38
+ decrease.
39
+ Args:
40
+ precision: A float [N, 1] numpy array of precisions
41
+ recall: A float [N, 1] numpy array of recalls
42
+ Raises:
43
+ ValueError: if the input is not of the correct format
44
+ Returns:
45
+ average_precison: The area under the precision recall curve. NaN if
46
+ precision and recall are None.
47
+ """
48
+ if precision is None:
49
+ if recall is not None:
50
+ raise ValueError("If precision is None, recall must also be None")
51
+ return np.NAN
52
+
53
+ if not isinstance(precision, np.ndarray) or not isinstance(
54
+ recall, np.ndarray):
55
+ raise ValueError("precision and recall must be numpy array")
56
+ if precision.dtype != np.float or recall.dtype != np.float:
57
+ raise ValueError("input must be float numpy array.")
58
+ if len(precision) != len(recall):
59
+ raise ValueError("precision and recall must be of the same size.")
60
+ if not precision.size:
61
+ return 0.0
62
+ if np.amin(precision) < 0 or np.amax(precision) > 1:
63
+ raise ValueError("Precision must be in the range of [0, 1].")
64
+ if np.amin(recall) < 0 or np.amax(recall) > 1:
65
+ raise ValueError("recall must be in the range of [0, 1].")
66
+ if not all(recall[i] <= recall[i + 1] for i in range(len(recall) - 1)):
67
+ raise ValueError("recall must be a non-decreasing array")
68
+
69
+ recall = np.concatenate([[0], recall, [1]])
70
+ precision = np.concatenate([[0], precision, [0]])
71
+
72
+ for i in range(len(precision) - 2, -1, -1):
73
+ precision[i] = np.maximum(precision[i], precision[i + 1])
74
+ indices = np.where(recall[1:] != recall[:-1])[0] + 1
75
+ average_precision = np.sum(
76
+ (recall[indices] - recall[indices - 1]) * precision[indices])
77
+ return average_precision
78
+
79
+ class OIDEval:
80
+ def __init__(
81
+ self, lvis_gt, lvis_dt, iou_type="bbox", expand_pred_label=False,
82
+ oid_hierarchy_path='./datasets/oid/annotations/challenge-2019-label500-hierarchy.json'):
83
+ """Constructor for OIDEval.
84
+ Args:
85
+ lvis_gt (LVIS class instance, or str containing path of annotation file)
86
+ lvis_dt (LVISResult class instance, or str containing path of result file,
87
+ or list of dict)
88
+ iou_type (str): segm or bbox evaluation
89
+ """
90
+ self.logger = logging.getLogger(__name__)
91
+
92
+ if iou_type not in ["bbox", "segm"]:
93
+ raise ValueError("iou_type: {} is not supported.".format(iou_type))
94
+
95
+ if isinstance(lvis_gt, LVIS):
96
+ self.lvis_gt = lvis_gt
97
+ elif isinstance(lvis_gt, str):
98
+ self.lvis_gt = LVIS(lvis_gt)
99
+ else:
100
+ raise TypeError("Unsupported type {} of lvis_gt.".format(lvis_gt))
101
+
102
+ if isinstance(lvis_dt, LVISResults):
103
+ self.lvis_dt = lvis_dt
104
+ elif isinstance(lvis_dt, (str, list)):
105
+ # self.lvis_dt = LVISResults(self.lvis_gt, lvis_dt, max_dets=-1)
106
+ self.lvis_dt = LVISResults(self.lvis_gt, lvis_dt)
107
+ else:
108
+ raise TypeError("Unsupported type {} of lvis_dt.".format(lvis_dt))
109
+
110
+ if expand_pred_label:
111
+ oid_hierarchy = json.load(open(oid_hierarchy_path, 'r'))
112
+ cat_info = self.lvis_gt.dataset['categories']
113
+ freebase2id = {x['freebase_id']: x['id'] for x in cat_info}
114
+ id2freebase = {x['id']: x['freebase_id'] for x in cat_info}
115
+ id2name = {x['id']: x['name'] for x in cat_info}
116
+
117
+ fas = defaultdict(set)
118
+ def dfs(hierarchy, cur_id):
119
+ all_childs = set()
120
+ all_keyed_child = {}
121
+ if 'Subcategory' in hierarchy:
122
+ for x in hierarchy['Subcategory']:
123
+ childs = dfs(x, freebase2id[x['LabelName']])
124
+ all_childs.update(childs)
125
+ if cur_id != -1:
126
+ for c in all_childs:
127
+ fas[c].add(cur_id)
128
+ all_childs.add(cur_id)
129
+ return all_childs
130
+ dfs(oid_hierarchy, -1)
131
+
132
+ expanded_pred = []
133
+ id_count = 0
134
+ for d in self.lvis_dt.dataset['annotations']:
135
+ cur_id = d['category_id']
136
+ ids = [cur_id] + [x for x in fas[cur_id]]
137
+ for cat_id in ids:
138
+ new_box = copy.deepcopy(d)
139
+ id_count = id_count + 1
140
+ new_box['id'] = id_count
141
+ new_box['category_id'] = cat_id
142
+ expanded_pred.append(new_box)
143
+
144
+ print('Expanding original {} preds to {} preds'.format(
145
+ len(self.lvis_dt.dataset['annotations']),
146
+ len(expanded_pred)
147
+ ))
148
+ self.lvis_dt.dataset['annotations'] = expanded_pred
149
+ self.lvis_dt._create_index()
150
+
151
+ # per-image per-category evaluation results
152
+ self.eval_imgs = defaultdict(list)
153
+ self.eval = {} # accumulated evaluation results
154
+ self._gts = defaultdict(list) # gt for evaluation
155
+ self._dts = defaultdict(list) # dt for evaluation
156
+ self.params = Params(iou_type=iou_type) # parameters
157
+ self.results = OrderedDict()
158
+ self.ious = {} # ious between all gts and dts
159
+
160
+ self.params.img_ids = sorted(self.lvis_gt.get_img_ids())
161
+ self.params.cat_ids = sorted(self.lvis_gt.get_cat_ids())
162
+
163
+ def _to_mask(self, anns, lvis):
164
+ for ann in anns:
165
+ rle = lvis.ann_to_rle(ann)
166
+ ann["segmentation"] = rle
167
+
168
+ def _prepare(self):
169
+ """Prepare self._gts and self._dts for evaluation based on params."""
170
+
171
+ cat_ids = self.params.cat_ids if self.params.cat_ids else None
172
+
173
+ gts = self.lvis_gt.load_anns(
174
+ self.lvis_gt.get_ann_ids(img_ids=self.params.img_ids, cat_ids=cat_ids)
175
+ )
176
+ dts = self.lvis_dt.load_anns(
177
+ self.lvis_dt.get_ann_ids(img_ids=self.params.img_ids, cat_ids=cat_ids)
178
+ )
179
+ # convert ground truth to mask if iou_type == 'segm'
180
+ if self.params.iou_type == "segm":
181
+ self._to_mask(gts, self.lvis_gt)
182
+ self._to_mask(dts, self.lvis_dt)
183
+
184
+ for gt in gts:
185
+ self._gts[gt["image_id"], gt["category_id"]].append(gt)
186
+
187
+ # For federated dataset evaluation we will filter out all dt for an
188
+ # image which belong to categories not present in gt and not present in
189
+ # the negative list for an image. In other words detector is not penalized
190
+ # for categories about which we don't have gt information about their
191
+ # presence or absence in an image.
192
+ img_data = self.lvis_gt.load_imgs(ids=self.params.img_ids)
193
+ # per image map of categories not present in image
194
+ img_nl = {d["id"]: d["neg_category_ids"] for d in img_data}
195
+ # per image list of categories present in image
196
+ img_pl = {d["id"]: d["pos_category_ids"] for d in img_data}
197
+ # img_pl = defaultdict(set)
198
+ for ann in gts:
199
+ # img_pl[ann["image_id"]].add(ann["category_id"])
200
+ assert ann["category_id"] in img_pl[ann["image_id"]]
201
+ # print('check pos ids OK.')
202
+
203
+ for dt in dts:
204
+ img_id, cat_id = dt["image_id"], dt["category_id"]
205
+ if cat_id not in img_nl[img_id] and cat_id not in img_pl[img_id]:
206
+ continue
207
+ self._dts[img_id, cat_id].append(dt)
208
+
209
+ def evaluate(self):
210
+ """
211
+ Run per image evaluation on given images and store results
212
+ (a list of dict) in self.eval_imgs.
213
+ """
214
+ self.logger.info("Running per image evaluation.")
215
+ self.logger.info("Evaluate annotation type *{}*".format(self.params.iou_type))
216
+
217
+ self.params.img_ids = list(np.unique(self.params.img_ids))
218
+
219
+ if self.params.use_cats:
220
+ cat_ids = self.params.cat_ids
221
+ else:
222
+ cat_ids = [-1]
223
+
224
+ self._prepare()
225
+
226
+ self.ious = {
227
+ (img_id, cat_id): self.compute_iou(img_id, cat_id)
228
+ for img_id in self.params.img_ids
229
+ for cat_id in cat_ids
230
+ }
231
+
232
+ # loop through images, area range, max detection number
233
+ print('Evaluating ...')
234
+ self.eval_imgs = [
235
+ self.evaluate_img_google(img_id, cat_id, area_rng)
236
+ for cat_id in cat_ids
237
+ for area_rng in self.params.area_rng
238
+ for img_id in self.params.img_ids
239
+ ]
240
+
241
+ def _get_gt_dt(self, img_id, cat_id):
242
+ """Create gt, dt which are list of anns/dets. If use_cats is true
243
+ only anns/dets corresponding to tuple (img_id, cat_id) will be
244
+ used. Else, all anns/dets in image are used and cat_id is not used.
245
+ """
246
+ if self.params.use_cats:
247
+ gt = self._gts[img_id, cat_id]
248
+ dt = self._dts[img_id, cat_id]
249
+ else:
250
+ gt = [
251
+ _ann
252
+ for _cat_id in self.params.cat_ids
253
+ for _ann in self._gts[img_id, cat_id]
254
+ ]
255
+ dt = [
256
+ _ann
257
+ for _cat_id in self.params.cat_ids
258
+ for _ann in self._dts[img_id, cat_id]
259
+ ]
260
+ return gt, dt
261
+
262
+ def compute_iou(self, img_id, cat_id):
263
+ gt, dt = self._get_gt_dt(img_id, cat_id)
264
+
265
+ if len(gt) == 0 and len(dt) == 0:
266
+ return []
267
+
268
+ # Sort detections in decreasing order of score.
269
+ idx = np.argsort([-d["score"] for d in dt], kind="mergesort")
270
+ dt = [dt[i] for i in idx]
271
+
272
+ # iscrowd = [int(False)] * len(gt)
273
+ iscrowd = [int('iscrowd' in g and g['iscrowd'] > 0) for g in gt]
274
+
275
+ if self.params.iou_type == "segm":
276
+ ann_type = "segmentation"
277
+ elif self.params.iou_type == "bbox":
278
+ ann_type = "bbox"
279
+ else:
280
+ raise ValueError("Unknown iou_type for iou computation.")
281
+ gt = [g[ann_type] for g in gt]
282
+ dt = [d[ann_type] for d in dt]
283
+
284
+ # compute iou between each dt and gt region
285
+ # will return array of shape len(dt), len(gt)
286
+ ious = mask_utils.iou(dt, gt, iscrowd)
287
+ return ious
288
+
289
+ def evaluate_img_google(self, img_id, cat_id, area_rng):
290
+ gt, dt = self._get_gt_dt(img_id, cat_id)
291
+ if len(gt) == 0 and len(dt) == 0:
292
+ return None
293
+
294
+ if len(dt) == 0:
295
+ return {
296
+ "image_id": img_id,
297
+ "category_id": cat_id,
298
+ "area_rng": area_rng,
299
+ "dt_ids": [],
300
+ "dt_matches": np.array([], dtype=np.int32).reshape(1, -1),
301
+ "dt_scores": [],
302
+ "dt_ignore": np.array([], dtype=np.int32).reshape(1, -1),
303
+ 'num_gt': len(gt)
304
+ }
305
+
306
+ no_crowd_inds = [i for i, g in enumerate(gt) \
307
+ if ('iscrowd' not in g) or g['iscrowd'] == 0]
308
+ crowd_inds = [i for i, g in enumerate(gt) \
309
+ if 'iscrowd' in g and g['iscrowd'] == 1]
310
+ dt_idx = np.argsort([-d["score"] for d in dt], kind="mergesort")
311
+
312
+ if len(self.ious[img_id, cat_id]) > 0:
313
+ ious = self.ious[img_id, cat_id]
314
+ iou = ious[:, no_crowd_inds]
315
+ iou = iou[dt_idx]
316
+ ioa = ious[:, crowd_inds]
317
+ ioa = ioa[dt_idx]
318
+ else:
319
+ iou = np.zeros((len(dt_idx), 0))
320
+ ioa = np.zeros((len(dt_idx), 0))
321
+ scores = np.array([dt[i]['score'] for i in dt_idx])
322
+
323
+ num_detected_boxes = len(dt)
324
+ tp_fp_labels = np.zeros(num_detected_boxes, dtype=bool)
325
+ is_matched_to_group_of = np.zeros(num_detected_boxes, dtype=bool)
326
+
327
+ def compute_match_iou(iou):
328
+ max_overlap_gt_ids = np.argmax(iou, axis=1)
329
+ is_gt_detected = np.zeros(iou.shape[1], dtype=bool)
330
+ for i in range(num_detected_boxes):
331
+ gt_id = max_overlap_gt_ids[i]
332
+ is_evaluatable = (not tp_fp_labels[i] and
333
+ iou[i, gt_id] >= 0.5 and
334
+ not is_matched_to_group_of[i])
335
+ if is_evaluatable:
336
+ if not is_gt_detected[gt_id]:
337
+ tp_fp_labels[i] = True
338
+ is_gt_detected[gt_id] = True
339
+
340
+ def compute_match_ioa(ioa):
341
+ scores_group_of = np.zeros(ioa.shape[1], dtype=float)
342
+ tp_fp_labels_group_of = np.ones(
343
+ ioa.shape[1], dtype=float)
344
+ max_overlap_group_of_gt_ids = np.argmax(ioa, axis=1)
345
+ for i in range(num_detected_boxes):
346
+ gt_id = max_overlap_group_of_gt_ids[i]
347
+ is_evaluatable = (not tp_fp_labels[i] and
348
+ ioa[i, gt_id] >= 0.5 and
349
+ not is_matched_to_group_of[i])
350
+ if is_evaluatable:
351
+ is_matched_to_group_of[i] = True
352
+ scores_group_of[gt_id] = max(scores_group_of[gt_id], scores[i])
353
+ selector = np.where((scores_group_of > 0) & (tp_fp_labels_group_of > 0))
354
+ scores_group_of = scores_group_of[selector]
355
+ tp_fp_labels_group_of = tp_fp_labels_group_of[selector]
356
+
357
+ return scores_group_of, tp_fp_labels_group_of
358
+
359
+ if iou.shape[1] > 0:
360
+ compute_match_iou(iou)
361
+
362
+ scores_box_group_of = np.ndarray([0], dtype=float)
363
+ tp_fp_labels_box_group_of = np.ndarray([0], dtype=float)
364
+
365
+ if ioa.shape[1] > 0:
366
+ scores_box_group_of, tp_fp_labels_box_group_of = compute_match_ioa(ioa)
367
+
368
+ valid_entries = (~is_matched_to_group_of)
369
+
370
+ scores = np.concatenate(
371
+ (scores[valid_entries], scores_box_group_of))
372
+ tp_fps = np.concatenate(
373
+ (tp_fp_labels[valid_entries].astype(float),
374
+ tp_fp_labels_box_group_of))
375
+
376
+ return {
377
+ "image_id": img_id,
378
+ "category_id": cat_id,
379
+ "area_rng": area_rng,
380
+ "dt_matches": np.array([1 if x > 0 else 0 for x in tp_fps], dtype=np.int32).reshape(1, -1),
381
+ "dt_scores": [x for x in scores],
382
+ "dt_ignore": np.array([0 for x in scores], dtype=np.int32).reshape(1, -1),
383
+ 'num_gt': len(gt)
384
+ }
385
+
386
+ def accumulate(self):
387
+ """Accumulate per image evaluation results and store the result in
388
+ self.eval.
389
+ """
390
+ self.logger.info("Accumulating evaluation results.")
391
+
392
+ if not self.eval_imgs:
393
+ self.logger.warn("Please run evaluate first.")
394
+
395
+ if self.params.use_cats:
396
+ cat_ids = self.params.cat_ids
397
+ else:
398
+ cat_ids = [-1]
399
+
400
+ num_thrs = 1
401
+ num_recalls = 1
402
+
403
+ num_cats = len(cat_ids)
404
+ num_area_rngs = 1
405
+ num_imgs = len(self.params.img_ids)
406
+
407
+ # -1 for absent categories
408
+ precision = -np.ones(
409
+ (num_thrs, num_recalls, num_cats, num_area_rngs)
410
+ )
411
+ recall = -np.ones((num_thrs, num_cats, num_area_rngs))
412
+
413
+ # Initialize dt_pointers
414
+ dt_pointers = {}
415
+ for cat_idx in range(num_cats):
416
+ dt_pointers[cat_idx] = {}
417
+ for area_idx in range(num_area_rngs):
418
+ dt_pointers[cat_idx][area_idx] = {}
419
+
420
+ # Per category evaluation
421
+ for cat_idx in range(num_cats):
422
+ Nk = cat_idx * num_area_rngs * num_imgs
423
+ for area_idx in range(num_area_rngs):
424
+ Na = area_idx * num_imgs
425
+ E = [
426
+ self.eval_imgs[Nk + Na + img_idx]
427
+ for img_idx in range(num_imgs)
428
+ ]
429
+ # Remove elements which are None
430
+ E = [e for e in E if not e is None]
431
+ if len(E) == 0:
432
+ continue
433
+
434
+ dt_scores = np.concatenate([e["dt_scores"] for e in E], axis=0)
435
+ dt_idx = np.argsort(-dt_scores, kind="mergesort")
436
+ dt_scores = dt_scores[dt_idx]
437
+ dt_m = np.concatenate([e["dt_matches"] for e in E], axis=1)[:, dt_idx]
438
+ dt_ig = np.concatenate([e["dt_ignore"] for e in E], axis=1)[:, dt_idx]
439
+
440
+ num_gt = sum([e['num_gt'] for e in E])
441
+ if num_gt == 0:
442
+ continue
443
+
444
+ tps = np.logical_and(dt_m, np.logical_not(dt_ig))
445
+ fps = np.logical_and(np.logical_not(dt_m), np.logical_not(dt_ig))
446
+ tp_sum = np.cumsum(tps, axis=1).astype(dtype=np.float)
447
+ fp_sum = np.cumsum(fps, axis=1).astype(dtype=np.float)
448
+
449
+ dt_pointers[cat_idx][area_idx] = {
450
+ "tps": tps,
451
+ "fps": fps,
452
+ }
453
+
454
+ for iou_thr_idx, (tp, fp) in enumerate(zip(tp_sum, fp_sum)):
455
+ tp = np.array(tp)
456
+ fp = np.array(fp)
457
+ num_tp = len(tp)
458
+ rc = tp / num_gt
459
+
460
+ if num_tp:
461
+ recall[iou_thr_idx, cat_idx, area_idx] = rc[
462
+ -1
463
+ ]
464
+ else:
465
+ recall[iou_thr_idx, cat_idx, area_idx] = 0
466
+
467
+ # np.spacing(1) ~= eps
468
+ pr = tp / (fp + tp + np.spacing(1))
469
+ pr = pr.tolist()
470
+
471
+ for i in range(num_tp - 1, 0, -1):
472
+ if pr[i] > pr[i - 1]:
473
+ pr[i - 1] = pr[i]
474
+
475
+ mAP = compute_average_precision(
476
+ np.array(pr, np.float).reshape(-1),
477
+ np.array(rc, np.float).reshape(-1))
478
+ precision[iou_thr_idx, :, cat_idx, area_idx] = mAP
479
+
480
+ self.eval = {
481
+ "params": self.params,
482
+ "counts": [num_thrs, num_recalls, num_cats, num_area_rngs],
483
+ "date": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
484
+ "precision": precision,
485
+ "recall": recall,
486
+ "dt_pointers": dt_pointers,
487
+ }
488
+
489
+ def _summarize(self, summary_type):
490
+ s = self.eval["precision"]
491
+ if len(s[s > -1]) == 0:
492
+ mean_s = -1
493
+ else:
494
+ mean_s = np.mean(s[s > -1])
495
+ # print(s.reshape(1, 1, -1, 1))
496
+ return mean_s
497
+
498
+ def summarize(self):
499
+ """Compute and display summary metrics for evaluation results."""
500
+ if not self.eval:
501
+ raise RuntimeError("Please run accumulate() first.")
502
+
503
+ max_dets = self.params.max_dets
504
+ self.results["AP50"] = self._summarize('ap')
505
+
506
+ def run(self):
507
+ """Wrapper function which calculates the results."""
508
+ self.evaluate()
509
+ self.accumulate()
510
+ self.summarize()
511
+
512
+ def print_results(self):
513
+ template = " {:<18} {} @[ IoU={:<9} | area={:>6s} | maxDets={:>3d} catIds={:>3s}] = {:0.3f}"
514
+
515
+ for key, value in self.results.items():
516
+ max_dets = self.params.max_dets
517
+ if "AP" in key:
518
+ title = "Average Precision"
519
+ _type = "(AP)"
520
+ else:
521
+ title = "Average Recall"
522
+ _type = "(AR)"
523
+
524
+ if len(key) > 2 and key[2].isdigit():
525
+ iou_thr = (float(key[2:]) / 100)
526
+ iou = "{:0.2f}".format(iou_thr)
527
+ else:
528
+ iou = "{:0.2f}:{:0.2f}".format(
529
+ self.params.iou_thrs[0], self.params.iou_thrs[-1]
530
+ )
531
+
532
+ cat_group_name = "all"
533
+ area_rng = "all"
534
+
535
+ print(template.format(title, _type, iou, area_rng, max_dets, cat_group_name, value))
536
+
537
+ def get_results(self):
538
+ if not self.results:
539
+ self.logger.warn("results is empty. Call run().")
540
+ return self.results
541
+
542
+
543
+ class Params:
544
+ def __init__(self, iou_type):
545
+ self.img_ids = []
546
+ self.cat_ids = []
547
+ # np.arange causes trouble. the data point on arange is slightly
548
+ # larger than the true value
549
+ self.iou_thrs = np.linspace(
550
+ 0.5, 0.95, int(np.round((0.95 - 0.5) / 0.05)) + 1, endpoint=True
551
+ )
552
+ self.google_style = True
553
+ # print('Using google style PR curve')
554
+ self.iou_thrs = self.iou_thrs[:1]
555
+ self.max_dets = 1000
556
+
557
+ self.area_rng = [
558
+ [0 ** 2, 1e5 ** 2],
559
+ ]
560
+ self.area_rng_lbl = ["all"]
561
+ self.use_cats = 1
562
+ self.iou_type = iou_type
563
+
564
+
565
+ class OIDEvaluator(DatasetEvaluator):
566
+ def __init__(self, dataset_name, cfg, distributed, output_dir=None):
567
+ self._distributed = distributed
568
+ self._output_dir = output_dir
569
+
570
+ self._cpu_device = torch.device("cpu")
571
+ self._logger = logging.getLogger(__name__)
572
+
573
+ self._metadata = MetadataCatalog.get(dataset_name)
574
+ json_file = PathManager.get_local_path(self._metadata.json_file)
575
+ self._oid_api = LVIS(json_file)
576
+ # Test set json files do not contain annotations (evaluation must be
577
+ # performed using the LVIS evaluation server).
578
+ self._do_evaluation = len(self._oid_api.get_ann_ids()) > 0
579
+ self._mask_on = cfg.MODEL.MASK_ON
580
+
581
+ def reset(self):
582
+ self._predictions = []
583
+ self._oid_results = []
584
+
585
+ def process(self, inputs, outputs):
586
+ for input, output in zip(inputs, outputs):
587
+ prediction = {"image_id": input["image_id"]}
588
+ instances = output["instances"].to(self._cpu_device)
589
+ prediction["instances"] = instances_to_coco_json(
590
+ instances, input["image_id"])
591
+ self._predictions.append(prediction)
592
+
593
+ def evaluate(self):
594
+ if self._distributed:
595
+ comm.synchronize()
596
+ self._predictions = comm.gather(self._predictions, dst=0)
597
+ self._predictions = list(itertools.chain(*self._predictions))
598
+
599
+ if not comm.is_main_process():
600
+ return
601
+
602
+ if len(self._predictions) == 0:
603
+ self._logger.warning("[LVISEvaluator] Did not receive valid predictions.")
604
+ return {}
605
+
606
+ self._logger.info("Preparing results in the OID format ...")
607
+ self._oid_results = list(
608
+ itertools.chain(*[x["instances"] for x in self._predictions]))
609
+
610
+ # unmap the category ids for LVIS (from 0-indexed to 1-indexed)
611
+ for result in self._oid_results:
612
+ result["category_id"] += 1
613
+
614
+ PathManager.mkdirs(self._output_dir)
615
+ file_path = os.path.join(
616
+ self._output_dir, "oid_instances_results.json")
617
+ self._logger.info("Saving results to {}".format(file_path))
618
+ with PathManager.open(file_path, "w") as f:
619
+ f.write(json.dumps(self._oid_results))
620
+ f.flush()
621
+
622
+ if not self._do_evaluation:
623
+ self._logger.info("Annotations are not available for evaluation.")
624
+ return
625
+
626
+ self._logger.info("Evaluating predictions ...")
627
+ self._results = OrderedDict()
628
+ res, mAP = _evaluate_predictions_on_oid(
629
+ self._oid_api,
630
+ file_path,
631
+ eval_seg=self._mask_on,
632
+ class_names=self._metadata.get("thing_classes"),
633
+ )
634
+ self._results['bbox'] = res
635
+ mAP_out_path = os.path.join(self._output_dir, "oid_mAP.npy")
636
+ self._logger.info('Saving mAP to' + mAP_out_path)
637
+ np.save(mAP_out_path, mAP)
638
+ return copy.deepcopy(self._results)
639
+
640
+ def _evaluate_predictions_on_oid(
641
+ oid_gt, oid_results_path, eval_seg=False,
642
+ class_names=None):
643
+ logger = logging.getLogger(__name__)
644
+ metrics = ["AP50", "AP50_expand"]
645
+
646
+ results = {}
647
+ oid_eval = OIDEval(oid_gt, oid_results_path, 'bbox', expand_pred_label=False)
648
+ oid_eval.run()
649
+ oid_eval.print_results()
650
+ results["AP50"] = oid_eval.get_results()["AP50"]
651
+
652
+ if eval_seg:
653
+ oid_eval = OIDEval(oid_gt, oid_results_path, 'segm', expand_pred_label=False)
654
+ oid_eval.run()
655
+ oid_eval.print_results()
656
+ results["AP50_segm"] = oid_eval.get_results()["AP50"]
657
+ else:
658
+ oid_eval = OIDEval(oid_gt, oid_results_path, 'bbox', expand_pred_label=True)
659
+ oid_eval.run()
660
+ oid_eval.print_results()
661
+ results["AP50_expand"] = oid_eval.get_results()["AP50"]
662
+
663
+ mAP = np.zeros(len(class_names)) - 1
664
+ precisions = oid_eval.eval['precision']
665
+ assert len(class_names) == precisions.shape[2]
666
+ results_per_category = []
667
+ id2apiid = sorted(oid_gt.get_cat_ids())
668
+ inst_aware_ap, inst_count = 0, 0
669
+ for idx, name in enumerate(class_names):
670
+ precision = precisions[:, :, idx, 0]
671
+ precision = precision[precision > -1]
672
+ ap = np.mean(precision) if precision.size else float("nan")
673
+ inst_num = len(oid_gt.get_ann_ids(cat_ids=[id2apiid[idx]]))
674
+ if inst_num > 0:
675
+ results_per_category.append(("{} {}".format(
676
+ name.replace(' ', '_'),
677
+ inst_num if inst_num < 1000 else '{:.1f}k'.format(inst_num / 1000)),
678
+ float(ap * 100)))
679
+ inst_aware_ap += inst_num * ap
680
+ inst_count += inst_num
681
+ mAP[idx] = ap
682
+ # logger.info("{} {} {:.2f}".format(name, inst_num, ap * 100))
683
+ inst_aware_ap = inst_aware_ap * 100 / inst_count
684
+ N_COLS = min(6, len(results_per_category) * 2)
685
+ results_flatten = list(itertools.chain(*results_per_category))
686
+ results_2d = itertools.zip_longest(*[results_flatten[i::N_COLS] for i in range(N_COLS)])
687
+ table = tabulate(
688
+ results_2d,
689
+ tablefmt="pipe",
690
+ floatfmt=".3f",
691
+ headers=["category", "AP"] * (N_COLS // 2),
692
+ numalign="left",
693
+ )
694
+ logger.info("Per-category {} AP: \n".format('bbox') + table)
695
+ logger.info("Instance-aware {} AP: {:.4f}".format('bbox', inst_aware_ap))
696
+
697
+ logger.info("Evaluation results for bbox: \n" + \
698
+ create_small_table(results))
699
+ return results, mAP
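compute_average_precision at the top of this file is the VOC-style area under the precision-recall curve: precision is first made non-increasing from right to left, then integrated over the recall steps. A tiny worked example with made-up PR points (run it against the numpy version this repo targets, since the helper still uses np.float):

import numpy as np
from proxydet.evaluation.oideval import compute_average_precision

precision = np.array([1.0, 0.5, 0.67, 0.5], dtype=float)
recall    = np.array([0.25, 0.25, 0.5, 0.5], dtype=float)

# After the right-to-left max, precision becomes [1.0, 0.67, 0.67, 0.5];
# recall steps at 0.25 and 0.5 give 0.25 * 1.0 + 0.25 * 0.67 = 0.4175
# (the padded final step to recall 1.0 contributes 0).
print(compute_average_precision(precision, recall))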
proxydet/modeling/backbone/swintransformer.py ADDED
@@ -0,0 +1,750 @@
1
+ # --------------------------------------------------------
2
+ # Swin Transformer
3
+ # Copyright (c) 2021 Microsoft
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # Written by Ze Liu, Yutong Lin, Yixuan Wei
6
+ # --------------------------------------------------------
7
+
8
+ # Copyright (c) Facebook, Inc. and its affiliates.
9
+ # Modified by Xingyi Zhou from https://github.com/SwinTransformer/Swin-Transformer-Object-Detection/blob/master/mmdet/models/backbones/swin_transformer.py
10
+
11
+
12
+ import torch
13
+ import torch.nn as nn
14
+ import torch.nn.functional as F
15
+ import torch.utils.checkpoint as checkpoint
16
+ import numpy as np
17
+ from timm.models.layers import DropPath, to_2tuple, trunc_normal_
18
+
19
+ from detectron2.layers import ShapeSpec
20
+ from detectron2.modeling.backbone.backbone import Backbone
21
+ from detectron2.modeling.backbone.build import BACKBONE_REGISTRY
22
+ from detectron2.modeling.backbone.fpn import FPN
23
+
24
+ from centernet.modeling.backbone.fpn_p5 import LastLevelP6P7_P5
25
+ from centernet.modeling.backbone.bifpn import BiFPN
26
+ # from .checkpoint import load_checkpoint
27
+
28
+ class Mlp(nn.Module):
29
+ """ Multilayer perceptron."""
30
+
31
+ def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
32
+ super().__init__()
33
+ out_features = out_features or in_features
34
+ hidden_features = hidden_features or in_features
35
+ self.fc1 = nn.Linear(in_features, hidden_features)
36
+ self.act = act_layer()
37
+ self.fc2 = nn.Linear(hidden_features, out_features)
38
+ self.drop = nn.Dropout(drop)
39
+
40
+ def forward(self, x):
41
+ x = self.fc1(x)
42
+ x = self.act(x)
43
+ x = self.drop(x)
44
+ x = self.fc2(x)
45
+ x = self.drop(x)
46
+ return x
47
+
48
+
49
+ def window_partition(x, window_size):
50
+ """
51
+ Args:
52
+ x: (B, H, W, C)
53
+ window_size (int): window size
54
+ Returns:
55
+ windows: (num_windows*B, window_size, window_size, C)
56
+ """
57
+ B, H, W, C = x.shape
58
+ x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
59
+ windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
60
+ return windows
61
+
62
+
63
+ def window_reverse(windows, window_size, H, W):
64
+ """
65
+ Args:
66
+ windows: (num_windows*B, window_size, window_size, C)
67
+ window_size (int): Window size
68
+ H (int): Height of image
69
+ W (int): Width of image
70
+ Returns:
71
+ x: (B, H, W, C)
72
+ """
73
+ B = int(windows.shape[0] / (H * W / window_size / window_size))
74
+ x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
75
+ x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
76
+ return x
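window_partition and window_reverse are exact inverses whenever H and W are multiples of the window size, which is what the padding in SwinTransformerBlock guarantees before they are called. A quick sanity-check sketch (shapes chosen purely for illustration):

import torch

x = torch.randn(2, 14, 14, 96)                 # (B, H, W, C), both 14s divisible by 7
windows = window_partition(x, window_size=7)   # (2 * 2 * 2, 7, 7, 96) == (8, 7, 7, 96)
y = window_reverse(windows, 7, 14, 14)         # back to (2, 14, 14, 96)
assert torch.equal(x, y)                       # pure reshape/permute, bit-exact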
77
+
78
+
79
+ class WindowAttention(nn.Module):
80
+ """ Window based multi-head self attention (W-MSA) module with relative position bias.
81
+ It supports both of shifted and non-shifted window.
82
+ Args:
83
+ dim (int): Number of input channels.
84
+ window_size (tuple[int]): The height and width of the window.
85
+ num_heads (int): Number of attention heads.
86
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
87
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
88
+ attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
89
+ proj_drop (float, optional): Dropout ratio of output. Default: 0.0
90
+ """
91
+
92
+ def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.):
93
+
94
+ super().__init__()
95
+ self.dim = dim
96
+ self.window_size = window_size # Wh, Ww
97
+ self.num_heads = num_heads
98
+ head_dim = dim // num_heads
99
+ self.scale = qk_scale or head_dim ** -0.5
100
+
101
+ # define a parameter table of relative position bias
102
+ self.relative_position_bias_table = nn.Parameter(
103
+ torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)) # 2*Wh-1 * 2*Ww-1, nH
104
+
105
+ # get pair-wise relative position index for each token inside the window
106
+ coords_h = torch.arange(self.window_size[0])
107
+ coords_w = torch.arange(self.window_size[1])
108
+ coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
109
+ coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
110
+ relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww
111
+ relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
112
+ relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0
113
+ relative_coords[:, :, 1] += self.window_size[1] - 1
114
+ relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
115
+ relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
116
+ self.register_buffer("relative_position_index", relative_position_index)
117
+
118
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
119
+ self.attn_drop = nn.Dropout(attn_drop)
120
+ self.proj = nn.Linear(dim, dim)
121
+ self.proj_drop = nn.Dropout(proj_drop)
122
+
123
+ trunc_normal_(self.relative_position_bias_table, std=.02)
124
+ self.softmax = nn.Softmax(dim=-1)
125
+
126
+ def forward(self, x, mask=None):
127
+ """ Forward function.
128
+ Args:
129
+ x: input features with shape of (num_windows*B, N, C)
130
+ mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
131
+ """
132
+ B_, N, C = x.shape
133
+ qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
134
+ q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple)
135
+
136
+ q = q * self.scale
137
+ attn = (q @ k.transpose(-2, -1))
138
+
139
+ relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
140
+ self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1) # Wh*Ww,Wh*Ww,nH
141
+ relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww
142
+ attn = attn + relative_position_bias.unsqueeze(0)
143
+
144
+ if mask is not None:
145
+ nW = mask.shape[0]
146
+ attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
147
+ attn = attn.view(-1, self.num_heads, N, N)
148
+ attn = self.softmax(attn)
149
+ else:
150
+ attn = self.softmax(attn)
151
+
152
+ attn = self.attn_drop(attn)
153
+
154
+ x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
155
+ x = self.proj(x)
156
+ x = self.proj_drop(x)
157
+ return x
158
+
159
+
160
+ class SwinTransformerBlock(nn.Module):
161
+ """ Swin Transformer Block.
162
+ Args:
163
+ dim (int): Number of input channels.
164
+ num_heads (int): Number of attention heads.
165
+ window_size (int): Window size.
166
+ shift_size (int): Shift size for SW-MSA.
167
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
168
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
169
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
170
+ drop (float, optional): Dropout rate. Default: 0.0
171
+ attn_drop (float, optional): Attention dropout rate. Default: 0.0
172
+ drop_path (float, optional): Stochastic depth rate. Default: 0.0
173
+ act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
174
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
175
+ """
176
+
177
+ def __init__(self, dim, num_heads, window_size=7, shift_size=0,
178
+ mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0.,
179
+ act_layer=nn.GELU, norm_layer=nn.LayerNorm):
180
+ super().__init__()
181
+ self.dim = dim
182
+ self.num_heads = num_heads
183
+ self.window_size = window_size
184
+ self.shift_size = shift_size
185
+ self.mlp_ratio = mlp_ratio
186
+ assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"
187
+
188
+ self.norm1 = norm_layer(dim)
189
+ self.attn = WindowAttention(
190
+ dim, window_size=to_2tuple(self.window_size), num_heads=num_heads,
191
+ qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
192
+
193
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
194
+ self.norm2 = norm_layer(dim)
195
+ mlp_hidden_dim = int(dim * mlp_ratio)
196
+ self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
197
+
198
+ self.H = None
199
+ self.W = None
200
+
201
+ def forward(self, x, mask_matrix):
202
+ """ Forward function.
203
+ Args:
204
+ x: Input feature, tensor size (B, H*W, C).
205
+ H, W: Spatial resolution of the input feature.
206
+ mask_matrix: Attention mask for cyclic shift.
207
+ """
208
+ B, L, C = x.shape
209
+ H, W = self.H, self.W
210
+ assert L == H * W, "input feature has wrong size"
211
+
212
+ shortcut = x
213
+ x = self.norm1(x)
214
+ x = x.view(B, H, W, C)
215
+
216
+ # pad feature maps to multiples of window size
217
+ pad_l = pad_t = 0
218
+ pad_r = (self.window_size - W % self.window_size) % self.window_size
219
+ pad_b = (self.window_size - H % self.window_size) % self.window_size
220
+ x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
221
+ _, Hp, Wp, _ = x.shape
222
+
223
+ # cyclic shift
224
+ if self.shift_size > 0:
225
+ shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
226
+ attn_mask = mask_matrix
227
+ else:
228
+ shifted_x = x
229
+ attn_mask = None
230
+
231
+ # partition windows
232
+ x_windows = window_partition(shifted_x, self.window_size) # nW*B, window_size, window_size, C
233
+ x_windows = x_windows.view(-1, self.window_size * self.window_size, C) # nW*B, window_size*window_size, C
234
+
235
+ # W-MSA/SW-MSA
236
+ attn_windows = self.attn(x_windows, mask=attn_mask) # nW*B, window_size*window_size, C
237
+
238
+ # merge windows
239
+ attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
240
+ shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp) # B H' W' C
241
+
242
+ # reverse cyclic shift
243
+ if self.shift_size > 0:
244
+ x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
245
+ else:
246
+ x = shifted_x
247
+
248
+ if pad_r > 0 or pad_b > 0:
249
+ x = x[:, :H, :W, :].contiguous()
250
+
251
+ x = x.view(B, H * W, C)
252
+
253
+ # FFN
254
+ x = shortcut + self.drop_path(x)
255
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
256
+
257
+ return x
258
+
259
+
260
+ class PatchMerging(nn.Module):
261
+ """ Patch Merging Layer
262
+ Args:
263
+ dim (int): Number of input channels.
264
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
265
+ """
266
+ def __init__(self, dim, norm_layer=nn.LayerNorm):
267
+ super().__init__()
268
+ self.dim = dim
269
+ self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
270
+ self.norm = norm_layer(4 * dim)
271
+
272
+ def forward(self, x, H, W):
273
+ """ Forward function.
274
+ Args:
275
+ x: Input feature, tensor size (B, H*W, C).
276
+ H, W: Spatial resolution of the input feature.
277
+ """
278
+ B, L, C = x.shape
279
+ assert L == H * W, "input feature has wrong size"
280
+
281
+ x = x.view(B, H, W, C)
282
+
283
+ # padding
284
+ pad_input = (H % 2 == 1) or (W % 2 == 1)
285
+ if pad_input:
286
+ x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2))
287
+
288
+ x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C
289
+ x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C
290
+ x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C
291
+ x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C
292
+ x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C
293
+ x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C
294
+
295
+ x = self.norm(x)
296
+ x = self.reduction(x)
297
+
298
+ return x
299
+
300
+
301
+ class BasicLayer(nn.Module):
302
+ """ A basic Swin Transformer layer for one stage.
303
+ Args:
304
+ dim (int): Number of feature channels
305
+ depth (int): Depths of this stage.
306
+ num_heads (int): Number of attention head.
307
+ window_size (int): Local window size. Default: 7.
308
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
309
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
310
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
311
+ drop (float, optional): Dropout rate. Default: 0.0
312
+ attn_drop (float, optional): Attention dropout rate. Default: 0.0
313
+ drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
314
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
315
+ downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
316
+ use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
317
+ """
318
+
319
+ def __init__(self,
320
+ dim,
321
+ depth,
322
+ num_heads,
323
+ window_size=7,
324
+ mlp_ratio=4.,
325
+ qkv_bias=True,
326
+ qk_scale=None,
327
+ drop=0.,
328
+ attn_drop=0.,
329
+ drop_path=0.,
330
+ norm_layer=nn.LayerNorm,
331
+ downsample=None,
332
+ use_checkpoint=False):
333
+ super().__init__()
334
+ self.window_size = window_size
335
+ self.shift_size = window_size // 2
336
+ self.depth = depth
337
+ self.use_checkpoint = use_checkpoint
338
+
339
+ # build blocks
340
+ self.blocks = nn.ModuleList([
341
+ SwinTransformerBlock(
342
+ dim=dim,
343
+ num_heads=num_heads,
344
+ window_size=window_size,
345
+ shift_size=0 if (i % 2 == 0) else window_size // 2,
346
+ mlp_ratio=mlp_ratio,
347
+ qkv_bias=qkv_bias,
348
+ qk_scale=qk_scale,
349
+ drop=drop,
350
+ attn_drop=attn_drop,
351
+ drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
352
+ norm_layer=norm_layer)
353
+ for i in range(depth)])
354
+
355
+ # patch merging layer
356
+ if downsample is not None:
357
+ self.downsample = downsample(dim=dim, norm_layer=norm_layer)
358
+ else:
359
+ self.downsample = None
360
+
361
+ def forward(self, x, H, W):
362
+ """ Forward function.
363
+ Args:
364
+ x: Input feature, tensor size (B, H*W, C).
365
+ H, W: Spatial resolution of the input feature.
366
+ """
367
+
368
+ # calculate attention mask for SW-MSA
369
+ Hp = int(np.ceil(H / self.window_size)) * self.window_size
370
+ Wp = int(np.ceil(W / self.window_size)) * self.window_size
371
+ img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device) # 1 Hp Wp 1
372
+ h_slices = (slice(0, -self.window_size),
373
+ slice(-self.window_size, -self.shift_size),
374
+ slice(-self.shift_size, None))
375
+ w_slices = (slice(0, -self.window_size),
376
+ slice(-self.window_size, -self.shift_size),
377
+ slice(-self.shift_size, None))
378
+ cnt = 0
379
+ for h in h_slices:
380
+ for w in w_slices:
381
+ img_mask[:, h, w, :] = cnt
382
+ cnt += 1
383
+
384
+ mask_windows = window_partition(img_mask, self.window_size) # nW, window_size, window_size, 1
385
+ mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
386
+ attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
387
+ attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
388
+
389
+ for blk in self.blocks:
390
+ blk.H, blk.W = H, W
391
+ if self.use_checkpoint:
392
+ x = checkpoint.checkpoint(blk, x, attn_mask)
393
+ else:
394
+ x = blk(x, attn_mask)
395
+ if self.downsample is not None:
396
+ x_down = self.downsample(x, H, W)
397
+ Wh, Ww = (H + 1) // 2, (W + 1) // 2
398
+ return x, H, W, x_down, Wh, Ww
399
+ else:
400
+ return x, H, W, x, H, W
401
+
402
+
403
+ class PatchEmbed(nn.Module):
404
+ """ Image to Patch Embedding
405
+ Args:
406
+ patch_size (int): Patch token size. Default: 4.
407
+ in_chans (int): Number of input image channels. Default: 3.
408
+ embed_dim (int): Number of linear projection output channels. Default: 96.
409
+ norm_layer (nn.Module, optional): Normalization layer. Default: None
410
+ """
411
+
412
+ def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
413
+ super().__init__()
414
+ patch_size = to_2tuple(patch_size)
415
+ self.patch_size = patch_size
416
+
417
+ self.in_chans = in_chans
418
+ self.embed_dim = embed_dim
419
+
420
+ self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
421
+ if norm_layer is not None:
422
+ self.norm = norm_layer(embed_dim)
423
+ else:
424
+ self.norm = None
425
+
426
+ def forward(self, x):
427
+ """Forward function."""
428
+ # padding
429
+ _, _, H, W = x.size()
430
+ if W % self.patch_size[1] != 0:
431
+ x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1]))
432
+ if H % self.patch_size[0] != 0:
433
+ x = F.pad(x, (0, 0, 0, self.patch_size[0] - H % self.patch_size[0]))
434
+
435
+ x = self.proj(x) # B C Wh Ww
436
+ if self.norm is not None:
437
+ Wh, Ww = x.size(2), x.size(3)
438
+ x = x.flatten(2).transpose(1, 2)
439
+ x = self.norm(x)
440
+ x = x.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww)
441
+
442
+ return x
443
+
444
+
445
+ class SwinTransformer(Backbone):
446
+ """ Swin Transformer backbone.
447
+ A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` -
448
+ https://arxiv.org/pdf/2103.14030
449
+ Args:
450
+ pretrain_img_size (int): Input image size for training the pretrained model,
451
+ used in absolute postion embedding. Default 224.
452
+ patch_size (int | tuple(int)): Patch size. Default: 4.
453
+ in_chans (int): Number of input image channels. Default: 3.
454
+ embed_dim (int): Number of linear projection output channels. Default: 96.
455
+ depths (tuple[int]): Depths of each Swin Transformer stage.
456
+ num_heads (tuple[int]): Number of attention head of each stage.
457
+ window_size (int): Window size. Default: 7.
458
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
459
+ qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
460
+ qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
461
+ drop_rate (float): Dropout rate.
462
+ attn_drop_rate (float): Attention dropout rate. Default: 0.
463
+ drop_path_rate (float): Stochastic depth rate. Default: 0.2.
464
+ norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
465
+ ape (bool): If True, add absolute position embedding to the patch embedding. Default: False.
466
+ patch_norm (bool): If True, add normalization after patch embedding. Default: True.
467
+ out_indices (Sequence[int]): Output from which stages.
468
+ frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
469
+ -1 means not freezing any parameters.
470
+ use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
471
+ """
472
+
473
+ def __init__(self,
474
+ pretrain_img_size=224,
475
+ patch_size=4,
476
+ in_chans=3,
477
+ embed_dim=96,
478
+ depths=[2, 2, 6, 2],
479
+ num_heads=[3, 6, 12, 24],
480
+ window_size=7,
481
+ mlp_ratio=4.,
482
+ qkv_bias=True,
483
+ qk_scale=None,
484
+ drop_rate=0.,
485
+ attn_drop_rate=0.,
486
+ drop_path_rate=0.2,
487
+ norm_layer=nn.LayerNorm,
488
+ ape=False,
489
+ patch_norm=True,
490
+ out_indices=(0, 1, 2, 3),
491
+ frozen_stages=-1,
492
+ use_checkpoint=False):
493
+ super().__init__()
494
+
495
+ self.pretrain_img_size = pretrain_img_size
496
+ self.num_layers = len(depths)
497
+ self.embed_dim = embed_dim
498
+ self.ape = ape
499
+ self.patch_norm = patch_norm
500
+ self.out_indices = out_indices
501
+ self.frozen_stages = frozen_stages
502
+
503
+ # split image into non-overlapping patches
504
+ self.patch_embed = PatchEmbed(
505
+ patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim,
506
+ norm_layer=norm_layer if self.patch_norm else None)
507
+
508
+ # absolute position embedding
509
+ if self.ape:
510
+ pretrain_img_size = to_2tuple(pretrain_img_size)
511
+ patch_size = to_2tuple(patch_size)
512
+ patches_resolution = [pretrain_img_size[0] // patch_size[0], pretrain_img_size[1] // patch_size[1]]
513
+
514
+ self.absolute_pos_embed = nn.Parameter(torch.zeros(1, embed_dim, patches_resolution[0], patches_resolution[1]))
515
+ trunc_normal_(self.absolute_pos_embed, std=.02)
516
+
517
+ self.pos_drop = nn.Dropout(p=drop_rate)
518
+
519
+ # stochastic depth
520
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule
521
+
522
+ # build layers
523
+ self.layers = nn.ModuleList()
524
+ for i_layer in range(self.num_layers):
525
+ layer = BasicLayer(
526
+ dim=int(embed_dim * 2 ** i_layer),
527
+ depth=depths[i_layer],
528
+ num_heads=num_heads[i_layer],
529
+ window_size=window_size,
530
+ mlp_ratio=mlp_ratio,
531
+ qkv_bias=qkv_bias,
532
+ qk_scale=qk_scale,
533
+ drop=drop_rate,
534
+ attn_drop=attn_drop_rate,
535
+ drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
536
+ norm_layer=norm_layer,
537
+ downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
538
+ use_checkpoint=use_checkpoint)
539
+ self.layers.append(layer)
540
+
541
+ num_features = [int(embed_dim * 2 ** i) for i in range(self.num_layers)]
542
+ self.num_features = num_features
543
+
544
+ # add a norm layer for each output
545
+ for i_layer in out_indices:
546
+ layer = norm_layer(num_features[i_layer])
547
+ layer_name = f'norm{i_layer}'
548
+ self.add_module(layer_name, layer)
549
+
550
+ self._freeze_stages()
551
+ self._out_features = ['swin{}'.format(i) for i in self.out_indices]
552
+ self._out_feature_channels = {
553
+ 'swin{}'.format(i): self.embed_dim * 2 ** i for i in self.out_indices
554
+ }
555
+ self._out_feature_strides = {
556
+ 'swin{}'.format(i): 2 ** (i + 2) for i in self.out_indices
557
+ }
558
+ self._size_divisibility = 32
559
+
560
+
561
+ def _freeze_stages(self):
562
+ if self.frozen_stages >= 0:
563
+ self.patch_embed.eval()
564
+ for param in self.patch_embed.parameters():
565
+ param.requires_grad = False
566
+
567
+ if self.frozen_stages >= 1 and self.ape:
568
+ self.absolute_pos_embed.requires_grad = False
569
+
570
+ if self.frozen_stages >= 2:
571
+ self.pos_drop.eval()
572
+ for i in range(0, self.frozen_stages - 1):
573
+ m = self.layers[i]
574
+ m.eval()
575
+ for param in m.parameters():
576
+ param.requires_grad = False
577
+
578
+ def init_weights(self, pretrained=None):
579
+ """Initialize the weights in backbone.
580
+ Args:
581
+ pretrained (str, optional): Path to pre-trained weights.
582
+ Defaults to None.
583
+ """
584
+
585
+ def _init_weights(m):
586
+ if isinstance(m, nn.Linear):
587
+ trunc_normal_(m.weight, std=.02)
588
+ if isinstance(m, nn.Linear) and m.bias is not None:
589
+ nn.init.constant_(m.bias, 0)
590
+ elif isinstance(m, nn.LayerNorm):
591
+ nn.init.constant_(m.bias, 0)
592
+ nn.init.constant_(m.weight, 1.0)
593
+
594
+ if isinstance(pretrained, str):
595
+ self.apply(_init_weights)
596
+ # load_checkpoint(self, pretrained, strict=False)
597
+ elif pretrained is None:
598
+ self.apply(_init_weights)
599
+ else:
600
+ raise TypeError('pretrained must be a str or None')
601
+
602
+ def forward(self, x):
603
+ """Forward function."""
604
+ x = self.patch_embed(x)
605
+
606
+ Wh, Ww = x.size(2), x.size(3)
607
+ if self.ape:
608
+ # interpolate the position embedding to the corresponding size
609
+ absolute_pos_embed = F.interpolate(self.absolute_pos_embed, size=(Wh, Ww), mode='bicubic')
610
+ x = (x + absolute_pos_embed).flatten(2).transpose(1, 2) # B Wh*Ww C
611
+ else:
612
+ x = x.flatten(2).transpose(1, 2)
613
+ x = self.pos_drop(x)
614
+
615
+ # outs = []
616
+ outs = {}
617
+ for i in range(self.num_layers):
618
+ layer = self.layers[i]
619
+ x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)
620
+
621
+ if i in self.out_indices:
622
+ norm_layer = getattr(self, f'norm{i}')
623
+ x_out = norm_layer(x_out)
624
+
625
+ out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous()
626
+ # outs.append(out)
627
+ outs['swin{}'.format(i)] = out
628
+
629
+ return outs
630
+
631
+ def train(self, mode=True):
632
+ """Convert the model into training mode while keeping the frozen stages frozen."""
633
+ super(SwinTransformer, self).train(mode)
634
+ self._freeze_stages()
635
+
636
+ size2config = {
637
+ 'T': {
638
+ 'window_size': 7,
639
+ 'embed_dim': 96,
640
+ 'depth': [2, 2, 6, 2],
641
+ 'num_heads': [3, 6, 12, 24],
642
+ 'drop_path_rate': 0.2,
643
+ 'pretrained': 'models/swin_tiny_patch4_window7_224.pth'
644
+ },
645
+ 'S': {
646
+ 'window_size': 7,
647
+ 'embed_dim': 96,
648
+ 'depth': [2, 2, 18, 2],
649
+ 'num_heads': [3, 6, 12, 24],
650
+ 'drop_path_rate': 0.2,
651
+ 'pretrained': 'models/swin_small_patch4_window7_224.pth'
652
+ },
653
+ 'B': {
654
+ 'window_size': 7,
655
+ 'embed_dim': 128,
656
+ 'depth': [2, 2, 18, 2],
657
+ 'num_heads': [4, 8, 16, 32],
658
+ 'drop_path_rate': 0.3,
659
+ 'pretrained': 'models/swin_base_patch4_window7_224.pth'
660
+ },
661
+ 'B-22k': {
662
+ 'window_size': 7,
663
+ 'embed_dim': 128,
664
+ 'depth': [2, 2, 18, 2],
665
+ 'num_heads': [4, 8, 16, 32],
666
+ 'drop_path_rate': 0.3,
667
+ 'pretrained': 'models/swin_base_patch4_window7_224_22k.pth'
668
+ },
669
+ 'B-22k-384': {
670
+ 'window_size': 12,
671
+ 'embed_dim': 128,
672
+ 'depth': [2, 2, 18, 2],
673
+ 'num_heads': [4, 8, 16, 32],
674
+ 'drop_path_rate': 0.3,
675
+ 'pretrained': 'models/swin_base_patch4_window12_384_22k.pth'
676
+ },
677
+ 'L-22k': {
678
+ 'window_size': 7,
679
+ 'embed_dim': 192,
680
+ 'depth': [2, 2, 18, 2],
681
+ 'num_heads': [6, 12, 24, 48],
682
+ 'drop_path_rate': 0.3, # TODO (xingyi): this is unclear
683
+ 'pretrained': 'models/swin_large_patch4_window7_224_22k.pth'
684
+ },
685
+ 'L-22k-384': {
686
+ 'window_size': 12,
687
+ 'embed_dim': 192,
688
+ 'depth': [2, 2, 18, 2],
689
+ 'num_heads': [6, 12, 24, 48],
690
+ 'drop_path_rate': 0.3, # TODO (xingyi): this is unclear
691
+ 'pretrained': 'models/swin_large_patch4_window12_384_22k.pth'
692
+ }
693
+ }
694
+
695
+ @BACKBONE_REGISTRY.register()
696
+ def build_swintransformer_backbone(cfg, input_shape):
697
+ """
698
+ Build a plain SwinTransformer backbone from the preset selected by cfg.MODEL.SWIN.SIZE.
+ """
699
+ config = size2config[cfg.MODEL.SWIN.SIZE]
700
+ out_indices = cfg.MODEL.SWIN.OUT_FEATURES
701
+ model = SwinTransformer(
702
+ embed_dim=config['embed_dim'],
703
+ window_size=config['window_size'],
704
+ depths=config['depth'],
705
+ num_heads=config['num_heads'],
706
+ drop_path_rate=config['drop_path_rate'],
707
+ out_indices=out_indices,
708
+ frozen_stages=-1,
709
+ use_checkpoint=cfg.MODEL.SWIN.USE_CHECKPOINT
710
+ )
711
+ # print('Initializing', config['pretrained'])
712
+ model.init_weights(config['pretrained'])
713
+ return model
714
+
715
+
716
+ @BACKBONE_REGISTRY.register()
717
+ def build_swintransformer_fpn_backbone(cfg, input_shape: ShapeSpec):
718
+ """
719
+ Wrap the SwinTransformer backbone in an FPN with extra P6/P7 levels generated from P5.
+ """
720
+ bottom_up = build_swintransformer_backbone(cfg, input_shape)
721
+ in_features = cfg.MODEL.FPN.IN_FEATURES
722
+ out_channels = cfg.MODEL.FPN.OUT_CHANNELS
723
+ backbone = FPN(
724
+ bottom_up=bottom_up,
725
+ in_features=in_features,
726
+ out_channels=out_channels,
727
+ norm=cfg.MODEL.FPN.NORM,
728
+ top_block=LastLevelP6P7_P5(out_channels, out_channels),
729
+ fuse_type=cfg.MODEL.FPN.FUSE_TYPE,
730
+ )
731
+ return backbone
732
+
733
+
734
+ @BACKBONE_REGISTRY.register()
735
+ def build_swintransformer_bifpn_backbone(cfg, input_shape: ShapeSpec):
736
+ """
737
+ Wrap the SwinTransformer backbone in a BiFPN, configured via cfg.MODEL.BIFPN.
+ """
738
+ bottom_up = build_swintransformer_backbone(cfg, input_shape)
739
+ in_features = cfg.MODEL.FPN.IN_FEATURES
740
+ backbone = BiFPN(
741
+ cfg=cfg,
742
+ bottom_up=bottom_up,
743
+ in_features=in_features,
744
+ out_channels=cfg.MODEL.BIFPN.OUT_CHANNELS,
745
+ norm=cfg.MODEL.BIFPN.NORM,
746
+ num_levels=cfg.MODEL.BIFPN.NUM_LEVELS,
747
+ num_bifpn=cfg.MODEL.BIFPN.NUM_BIFPN,
748
+ separable_conv=cfg.MODEL.BIFPN.SEPARABLE_CONV,
749
+ )
750
+ return backbone
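A minimal usage sketch for the backbone defined in this file (editorial example, not part of the commit; it assumes torch, timm and detectron2 are installed and that the module path matches where this file is added). The forward pass returns a dict keyed 'swin0'..'swin3' whose channels double per stage and whose strides are 4/8/16/32, which is exactly what the FPN/BiFPN builders above consume via _out_feature_channels and _out_feature_strides.

import torch
from proxydet.modeling.backbone.swintransformer import SwinTransformer  # assumed module path

model = SwinTransformer(embed_dim=96, depths=[2, 2, 6, 2],
                        num_heads=[3, 6, 12, 24], window_size=7,
                        out_indices=(0, 1, 2, 3))
model.init_weights(None)                     # random init; no checkpoint needed for the sketch
feats = model(torch.randn(1, 3, 224, 224))
for name, f in feats.items():
    print(name, tuple(f.shape))              # swin0 (1, 96, 56, 56) ... swin3 (1, 768, 7, 7)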
proxydet/modeling/backbone/timm.py ADDED
@@ -0,0 +1,221 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ # Copyright (c) Facebook, Inc. and its affiliates.
4
+ import math
5
+ from os.path import join
6
+ import numpy as np
7
+ import copy
8
+ from functools import partial
9
+
10
+ import torch
11
+ from torch import nn
12
+ import torch.utils.model_zoo as model_zoo
13
+ import torch.nn.functional as F
14
+ import fvcore.nn.weight_init as weight_init
15
+
16
+ from detectron2.modeling.backbone import FPN
17
+ from detectron2.modeling.backbone.build import BACKBONE_REGISTRY
18
+ from detectron2.layers.batch_norm import get_norm, FrozenBatchNorm2d
19
+ from detectron2.modeling.backbone import Backbone
20
+
21
+ from timm import create_model
22
+ from timm.models.helpers import build_model_with_cfg
23
+ from timm.models.registry import register_model
24
+ from timm.models.resnet import ResNet, Bottleneck
25
+ from timm.models.resnet import default_cfgs as default_cfgs_resnet
26
+ from timm.models.convnext import ConvNeXt, default_cfgs, checkpoint_filter_fn
27
+
28
+
29
+ @register_model
30
+ def convnext_tiny_21k(pretrained=False, **kwargs):
31
+ model_args = dict(depths=(3, 3, 9, 3), dims=(96, 192, 384, 768), **kwargs)
32
+ cfg = default_cfgs['convnext_tiny']
33
+ cfg['url'] = 'https://dl.fbaipublicfiles.com/convnext/convnext_tiny_22k_224.pth'
34
+ model = build_model_with_cfg(
35
+ ConvNeXt, 'convnext_tiny', pretrained,
36
+ default_cfg=cfg,
37
+ pretrained_filter_fn=checkpoint_filter_fn,
38
+ feature_cfg=dict(out_indices=(0, 1, 2, 3), flatten_sequential=True),
39
+ **model_args)
40
+ return model
41
+
42
+ class CustomResNet(ResNet):
43
+ def __init__(self, **kwargs):
44
+ self.out_indices = kwargs.pop('out_indices')
45
+ super().__init__(**kwargs)
46
+
47
+
48
+ def forward(self, x):
49
+ x = self.conv1(x)
50
+ x = self.bn1(x)
51
+ x = self.act1(x)
52
+ x = self.maxpool(x)
53
+ ret = [x]
54
+ x = self.layer1(x)
55
+ ret.append(x)
56
+ x = self.layer2(x)
57
+ ret.append(x)
58
+ x = self.layer3(x)
59
+ ret.append(x)
60
+ x = self.layer4(x)
61
+ ret.append(x)
62
+ return [ret[i] for i in self.out_indices]
63
+
64
+
65
+ def load_pretrained(self, cached_file):
66
+ data = torch.load(cached_file, map_location='cpu')
67
+ if 'state_dict' in data:
68
+ self.load_state_dict(data['state_dict'])
69
+ else:
70
+ self.load_state_dict(data)
71
+
72
+
73
+ model_params = {
74
+ 'resnet50_in21k': dict(block=Bottleneck, layers=[3, 4, 6, 3]),
75
+ }
76
+
77
+
78
+ def create_timm_resnet(variant, out_indices, pretrained=False, **kwargs):
79
+ params = model_params[variant]
80
+ default_cfgs_resnet['resnet50_in21k'] = \
81
+ copy.deepcopy(default_cfgs_resnet['resnet50'])
82
+ default_cfgs_resnet['resnet50_in21k']['url'] = \
83
+ 'https://miil-public-eu.oss-eu-central-1.aliyuncs.com/model-zoo/ImageNet_21K_P/models/resnet50_miil_21k.pth'
84
+ default_cfgs_resnet['resnet50_in21k']['num_classes'] = 11221
85
+
86
+ return build_model_with_cfg(
87
+ CustomResNet, variant, pretrained,
88
+ default_cfg=default_cfgs_resnet[variant],
89
+ out_indices=out_indices,
90
+ pretrained_custom_load=True,
91
+ **params,
92
+ **kwargs)
93
+
94
+
95
+ class LastLevelP6P7_P5(nn.Module):
96
+ """
97
+ Generate the extra FPN levels P6 and P7 from the P5 feature map (used as the FPN top block).
+ """
98
+ def __init__(self, in_channels, out_channels):
99
+ super().__init__()
100
+ self.num_levels = 2
101
+ self.in_feature = "p5"
102
+ self.p6 = nn.Conv2d(in_channels, out_channels, 3, 2, 1)
103
+ self.p7 = nn.Conv2d(out_channels, out_channels, 3, 2, 1)
104
+ for module in [self.p6, self.p7]:
105
+ weight_init.c2_xavier_fill(module)
106
+
107
+ def forward(self, c5):
108
+ p6 = self.p6(c5)
109
+ p7 = self.p7(F.relu(p6))
110
+ return [p6, p7]
111
+
112
+
113
+ def freeze_module(x):
114
+ """
115
+ Freeze all parameters of a module and convert its BatchNorm layers to FrozenBatchNorm2d.
+ """
116
+ for p in x.parameters():
117
+ p.requires_grad = False
118
+ FrozenBatchNorm2d.convert_frozen_batchnorm(x)
119
+ return x
120
+
121
+
122
+ class TIMM(Backbone):
123
+ def __init__(self, base_name, out_levels, freeze_at=0, norm='FrozenBN', pretrained=False):
124
+ super().__init__()
125
+ out_indices = [x - 1 for x in out_levels]
126
+ if base_name in model_params:
127
+ self.base = create_timm_resnet(
128
+ base_name, out_indices=out_indices,
129
+ pretrained=False)
130
+ elif 'eff' in base_name or 'resnet' in base_name or 'regnet' in base_name:
131
+ self.base = create_model(
132
+ base_name, features_only=True,
133
+ out_indices=out_indices, pretrained=pretrained)
134
+ elif 'convnext' in base_name:
135
+ drop_path_rate = 0.2 \
136
+ if ('tiny' in base_name or 'small' in base_name) else 0.3
137
+ self.base = create_model(
138
+ base_name, features_only=True,
139
+ out_indices=out_indices, pretrained=pretrained,
140
+ drop_path_rate=drop_path_rate)
141
+ else:
142
+ assert 0, base_name
143
+ feature_info = [dict(num_chs=f['num_chs'], reduction=f['reduction']) \
144
+ for i, f in enumerate(self.base.feature_info)]
145
+ self._out_features = ['layer{}'.format(x) for x in out_levels]
146
+ self._out_feature_channels = {
147
+ 'layer{}'.format(l): feature_info[l - 1]['num_chs'] for l in out_levels}
148
+ self._out_feature_strides = {
149
+ 'layer{}'.format(l): feature_info[l - 1]['reduction'] for l in out_levels}
150
+ self._size_divisibility = max(self._out_feature_strides.values())
151
+ if 'resnet' in base_name:
152
+ self.freeze(freeze_at)
153
+ if norm == 'FrozenBN':
154
+ self = FrozenBatchNorm2d.convert_frozen_batchnorm(self)
155
+
156
+ def freeze(self, freeze_at=0):
157
+ """
158
+ Freeze the backbone stem (freeze_at >= 1) and the first residual stage (freeze_at >= 2).
+ """
159
+ if freeze_at >= 1:
160
+ print('Freezing', self.base.conv1)
161
+ self.base.conv1 = freeze_module(self.base.conv1)
162
+ if freeze_at >= 2:
163
+ print('Freezing', self.base.layer1)
164
+ self.base.layer1 = freeze_module(self.base.layer1)
165
+
166
+ def forward(self, x):
167
+ features = self.base(x)
168
+ ret = {k: v for k, v in zip(self._out_features, features)}
169
+ return ret
170
+
171
+ @property
172
+ def size_divisibility(self):
173
+ return self._size_divisibility
174
+
175
+
176
+ @BACKBONE_REGISTRY.register()
177
+ def build_timm_backbone(cfg, input_shape):
178
+ model = TIMM(
179
+ cfg.MODEL.TIMM.BASE_NAME,
180
+ cfg.MODEL.TIMM.OUT_LEVELS,
181
+ freeze_at=cfg.MODEL.TIMM.FREEZE_AT,
182
+ norm=cfg.MODEL.TIMM.NORM,
183
+ pretrained=cfg.MODEL.TIMM.PRETRAINED,
184
+ )
185
+ return model
186
+
187
+
188
+ @BACKBONE_REGISTRY.register()
189
+ def build_p67_timm_fpn_backbone(cfg, input_shape):
190
+ """
191
+ Build a timm backbone wrapped in an FPN with extra P6/P7 levels generated from P5.
+ """
192
+ bottom_up = build_timm_backbone(cfg, input_shape)
193
+ in_features = cfg.MODEL.FPN.IN_FEATURES
194
+ out_channels = cfg.MODEL.FPN.OUT_CHANNELS
195
+ backbone = FPN(
196
+ bottom_up=bottom_up,
197
+ in_features=in_features,
198
+ out_channels=out_channels,
199
+ norm=cfg.MODEL.FPN.NORM,
200
+ top_block=LastLevelP6P7_P5(out_channels, out_channels),
201
+ fuse_type=cfg.MODEL.FPN.FUSE_TYPE,
202
+ )
203
+ return backbone
204
+
205
+ @BACKBONE_REGISTRY.register()
206
+ def build_p35_timm_fpn_backbone(cfg, input_shape):
207
+ """
208
+ Build a timm backbone wrapped in an FPN without extra top levels (no P6/P7).
+ """
209
+ bottom_up = build_timm_backbone(cfg, input_shape)
210
+
211
+ in_features = cfg.MODEL.FPN.IN_FEATURES
212
+ out_channels = cfg.MODEL.FPN.OUT_CHANNELS
213
+ backbone = FPN(
214
+ bottom_up=bottom_up,
215
+ in_features=in_features,
216
+ out_channels=out_channels,
217
+ norm=cfg.MODEL.FPN.NORM,
218
+ top_block=None,
219
+ fuse_type=cfg.MODEL.FPN.FUSE_TYPE,
220
+ )
221
+ return backbone
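For context on the TIMM wrapper above, this is the timm features_only API it builds on (a minimal sketch assuming only the timm and torch packages; editorial, not part of the commit). The wrapper copies each selected stage's num_chs and reduction from feature_info into detectron2's _out_feature_channels and _out_feature_strides, and renames the outputs to 'layer{level}'.

import torch
import timm

base = timm.create_model('resnet50', features_only=True,
                         out_indices=(2, 3, 4), pretrained=False)
feats = base(torch.randn(1, 3, 256, 256))
print(base.feature_info.channels())     # [512, 1024, 2048]
print(base.feature_info.reduction())    # [8, 16, 32]
print([tuple(f.shape) for f in feats])  # [(1, 512, 32, 32), (1, 1024, 16, 16), (1, 2048, 8, 8)]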
proxydet/modeling/debug.py ADDED
@@ -0,0 +1,334 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ import cv2
3
+ import numpy as np
4
+ import torch
5
+ import torch.nn.functional as F
6
+ import os
7
+
8
+ COLORS = ((np.random.rand(1300, 3) * 0.4 + 0.6) * 255).astype(
9
+ np.uint8).reshape(1300, 1, 1, 3)
10
+
11
+ def _get_color_image(heatmap):
12
+ heatmap = heatmap.reshape(
13
+ heatmap.shape[0], heatmap.shape[1], heatmap.shape[2], 1)
14
+ if heatmap.shape[0] == 1:
15
+ color_map = (heatmap * np.ones((1, 1, 1, 3), np.uint8) * 255).max(
16
+ axis=0).astype(np.uint8) # H, W, 3
17
+ else:
18
+ color_map = (heatmap * COLORS[:heatmap.shape[0]]).max(axis=0).astype(np.uint8) # H, W, 3
19
+
20
+ return color_map
21
+
22
+ def _blend_image(image, color_map, a=0.7):
23
+ color_map = cv2.resize(color_map, (image.shape[1], image.shape[0]))
24
+ ret = np.clip(image * (1 - a) + color_map * a, 0, 255).astype(np.uint8)
25
+ return ret
26
+
27
+ def _blend_image_heatmaps(image, color_maps, a=0.7):
28
+ merges = np.zeros((image.shape[0], image.shape[1], 3), np.float32)
29
+ for color_map in color_maps:
30
+ color_map = cv2.resize(color_map, (image.shape[1], image.shape[0]))
31
+ merges = np.maximum(merges, color_map)
32
+ ret = np.clip(image * (1 - a) + merges * a, 0, 255).astype(np.uint8)
33
+ return ret
34
+
35
+ def _decompose_level(x, shapes_per_level, N):
36
+ '''
37
+ x: LNHiWi x C
38
+ '''
39
+ x = x.view(x.shape[0], -1)
40
+ ret = []
41
+ st = 0
42
+ for l in range(len(shapes_per_level)):
43
+ ret.append([])
44
+ h = shapes_per_level[l][0].int().item()
45
+ w = shapes_per_level[l][1].int().item()
46
+ for i in range(N):
47
+ ret[l].append(x[st + h * w * i:st + h * w * (i + 1)].view(
48
+ h, w, -1).permute(2, 0, 1))
49
+ st += h * w * N
50
+ return ret
51
+
52
+ def _imagelist_to_tensor(images):
53
+ images = [x for x in images]
54
+ image_sizes = [x.shape[-2:] for x in images]
55
+ h = max([size[0] for size in image_sizes])
56
+ w = max([size[1] for size in image_sizes])
57
+ S = 32
58
+ h, w = ((h - 1) // S + 1) * S, ((w - 1) // S + 1) * S
59
+ images = [F.pad(x, (0, w - x.shape[2], 0, h - x.shape[1], 0, 0)) \
60
+ for x in images]
61
+ images = torch.stack(images)
62
+ return images
63
+
64
+
65
+ def _ind2il(ind, shapes_per_level, N):
66
+ r = ind
67
+ l = 0
68
+ S = 0
69
+ while r - S >= N * shapes_per_level[l][0] * shapes_per_level[l][1]:
70
+ S += N * shapes_per_level[l][0] * shapes_per_level[l][1]
71
+ l += 1
72
+ i = (r - S) // (shapes_per_level[l][0] * shapes_per_level[l][1])
73
+ return i, l
74
+
75
+ def debug_train(
76
+ images, gt_instances, flattened_hms, reg_targets, labels, pos_inds,
77
+ shapes_per_level, locations, strides):
78
+ '''
79
+ images: N x 3 x H x W
80
+ flattened_hms: LNHiWi x C
81
+ shapes_per_level: L x 2 [(H_i, W_i)]
82
+ locations: LNHiWi x 2
83
+ '''
84
+ reg_inds = torch.nonzero(
85
+ reg_targets.max(dim=1)[0] > 0).squeeze(1)
86
+ N = len(images)
87
+ images = _imagelist_to_tensor(images)
88
+ repeated_locations = [torch.cat([loc] * N, dim=0) \
89
+ for loc in locations]
90
+ locations = torch.cat(repeated_locations, dim=0)
91
+ gt_hms = _decompose_level(flattened_hms, shapes_per_level, N)
92
+ masks = flattened_hms.new_zeros((flattened_hms.shape[0], 1))
93
+ masks[pos_inds] = 1
94
+ masks = _decompose_level(masks, shapes_per_level, N)
95
+ for i in range(len(images)):
96
+ image = images[i].detach().cpu().numpy().transpose(1, 2, 0)
97
+ color_maps = []
98
+ for l in range(len(gt_hms)):
99
+ color_map = _get_color_image(
100
+ gt_hms[l][i].detach().cpu().numpy())
101
+ color_maps.append(color_map)
102
+ cv2.imshow('gthm_{}'.format(l), color_map)
103
+ blend = _blend_image_heatmaps(image.copy(), color_maps)
104
+ if gt_instances is not None:
105
+ bboxes = gt_instances[i].gt_boxes.tensor
106
+ for j in range(len(bboxes)):
107
+ bbox = bboxes[j]
108
+ cv2.rectangle(
109
+ blend,
110
+ (int(bbox[0]), int(bbox[1])),
111
+ (int(bbox[2]), int(bbox[3])),
112
+ (0, 0, 255), 3, cv2.LINE_AA)
113
+
114
+ for j in range(len(pos_inds)):
115
+ image_id, l = _ind2il(pos_inds[j], shapes_per_level, N)
116
+ if image_id != i:
117
+ continue
118
+ loc = locations[pos_inds[j]]
119
+ cv2.drawMarker(
120
+ blend, (int(loc[0]), int(loc[1])), (0, 255, 255),
121
+ markerSize=(l + 1) * 16)
122
+
123
+ for j in range(len(reg_inds)):
124
+ image_id, l = _ind2il(reg_inds[j], shapes_per_level, N)
125
+ if image_id != i:
126
+ continue
127
+ ltrb = reg_targets[reg_inds[j]]
128
+ ltrb *= strides[l]
129
+ loc = locations[reg_inds[j]]
130
+ bbox = [(loc[0] - ltrb[0]), (loc[1] - ltrb[1]),
131
+ (loc[0] + ltrb[2]), (loc[1] + ltrb[3])]
132
+ cv2.rectangle(
133
+ blend,
134
+ (int(bbox[0]), int(bbox[1])),
135
+ (int(bbox[2]), int(bbox[3])),
136
+ (255, 0, 0), 1, cv2.LINE_AA)
137
+ cv2.circle(blend, (int(loc[0]), int(loc[1])), 2, (255, 0, 0), -1)
138
+
139
+ cv2.imshow('blend', blend)
140
+ cv2.waitKey()
141
+
142
+
143
+ def debug_test(
144
+ images, logits_pred, reg_pred, agn_hm_pred=[], preds=[],
145
+ vis_thresh=0.3, debug_show_name=False, mult_agn=False):
146
+ '''
147
+ images: N x 3 x H x W
148
+ class_target: LNHiWi x C
149
+ cat_agn_heatmap: LNHiWi
150
+ shapes_per_level: L x 2 [(H_i, W_i)]
151
+ '''
152
+ N = len(images)
153
+ for i in range(len(images)):
154
+ image = images[i].detach().cpu().numpy().transpose(1, 2, 0)
155
+ result = image.copy().astype(np.uint8)
156
+ pred_image = image.copy().astype(np.uint8)
157
+ color_maps = []
158
+ L = len(logits_pred)
159
+ for l in range(L):
160
+ if logits_pred[0] is not None:
161
+ stride = min(image.shape[0], image.shape[1]) / min(
162
+ logits_pred[l][i].shape[1], logits_pred[l][i].shape[2])
163
+ else:
164
+ stride = min(image.shape[0], image.shape[1]) / min(
165
+ agn_hm_pred[l][i].shape[1], agn_hm_pred[l][i].shape[2])
166
+ stride = stride if stride < 60 else 64 if stride < 100 else 128
167
+ if logits_pred[0] is not None:
168
+ if mult_agn:
169
+ logits_pred[l][i] = logits_pred[l][i] * agn_hm_pred[l][i]
170
+ color_map = _get_color_image(
171
+ logits_pred[l][i].detach().cpu().numpy())
172
+ color_maps.append(color_map)
173
+ cv2.imshow('predhm_{}'.format(l), color_map)
174
+
175
+ if debug_show_name:
176
+ from detectron2.data.datasets.lvis_v1_categories import LVIS_CATEGORIES
177
+ cat2name = [x['name'] for x in LVIS_CATEGORIES]
178
+ for j in range(len(preds[i].scores) if preds is not None else 0):
179
+ if preds[i].scores[j] > vis_thresh:
180
+ bbox = preds[i].proposal_boxes[j] \
181
+ if preds[i].has('proposal_boxes') else \
182
+ preds[i].pred_boxes[j]
183
+ bbox = bbox.tensor[0].detach().cpu().numpy().astype(np.int32)
184
+ cat = int(preds[i].pred_classes[j]) \
185
+ if preds[i].has('pred_classes') else 0
186
+ cl = COLORS[cat, 0, 0]
187
+ cv2.rectangle(
188
+ pred_image, (int(bbox[0]), int(bbox[1])),
189
+ (int(bbox[2]), int(bbox[3])),
190
+ (int(cl[0]), int(cl[1]), int(cl[2])), 2, cv2.LINE_AA)
191
+ if debug_show_name:
192
+ txt = '{}{:.1f}'.format(
193
+ cat2name[cat] if cat > 0 else '',
194
+ preds[i].scores[j])
195
+ font = cv2.FONT_HERSHEY_SIMPLEX
196
+ cat_size = cv2.getTextSize(txt, font, 0.5, 2)[0]
197
+ cv2.rectangle(
198
+ pred_image,
199
+ (int(bbox[0]), int(bbox[1] - cat_size[1] - 2)),
200
+ (int(bbox[0] + cat_size[0]), int(bbox[1] - 2)),
201
+ (int(cl[0]), int(cl[1]), int(cl[2])), -1)
202
+ cv2.putText(
203
+ pred_image, txt, (int(bbox[0]), int(bbox[1] - 2)),
204
+ font, 0.5, (0, 0, 0), thickness=1, lineType=cv2.LINE_AA)
205
+
206
+
207
+ if agn_hm_pred[l] is not None:
208
+ agn_hm_ = agn_hm_pred[l][i, 0, :, :, None].detach().cpu().numpy()
209
+ agn_hm_ = (agn_hm_ * np.array([255, 255, 255]).reshape(
210
+ 1, 1, 3)).astype(np.uint8)
211
+ cv2.imshow('agn_hm_{}'.format(l), agn_hm_)
212
+ blend = _blend_image_heatmaps(image.copy(), color_maps)
213
+ cv2.imshow('blend', blend)
214
+ cv2.imshow('preds', pred_image)
215
+ cv2.waitKey()
216
+
217
+ global cnt
218
+ cnt = 0
219
+
220
+ def debug_second_stage(images, instances, proposals=None, vis_thresh=0.3,
221
+ save_debug=False, debug_show_name=False, image_labels=[],
222
+ save_debug_path='output/save_debug/',
223
+ bgr=False):
224
+ images = _imagelist_to_tensor(images)
225
+ if 'COCO' in save_debug_path:
226
+ from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES
227
+ cat2name = [x['name'] for x in COCO_CATEGORIES]
228
+ else:
229
+ from detectron2.data.datasets.lvis_v1_categories import LVIS_CATEGORIES
230
+ cat2name = ['({}){}'.format(x['frequency'], x['name']) \
231
+ for x in LVIS_CATEGORIES]
232
+ for i in range(len(images)):
233
+ image = images[i].detach().cpu().numpy().transpose(1, 2, 0).astype(np.uint8).copy()
234
+ if bgr:
235
+ image = image[:, :, ::-1].copy()
236
+ if instances[i].has('gt_boxes'):
237
+ bboxes = instances[i].gt_boxes.tensor.cpu().numpy()
238
+ scores = np.ones(bboxes.shape[0])
239
+ cats = instances[i].gt_classes.cpu().numpy()
240
+ else:
241
+ bboxes = instances[i].pred_boxes.tensor.cpu().numpy()
242
+ scores = instances[i].scores.cpu().numpy()
243
+ cats = instances[i].pred_classes.cpu().numpy()
244
+ for j in range(len(bboxes)):
245
+ if scores[j] > vis_thresh:
246
+ bbox = bboxes[j]
247
+ cl = COLORS[cats[j], 0, 0]
248
+ cl = (int(cl[0]), int(cl[1]), int(cl[2]))
249
+ cv2.rectangle(
250
+ image,
251
+ (int(bbox[0]), int(bbox[1])),
252
+ (int(bbox[2]), int(bbox[3])),
253
+ cl, 2, cv2.LINE_AA)
254
+ if debug_show_name:
255
+ cat = cats[j]
256
+ txt = '{}{:.1f}'.format(
257
+ cat2name[cat] if cat > 0 else '',
258
+ scores[j])
259
+ font = cv2.FONT_HERSHEY_SIMPLEX
260
+ cat_size = cv2.getTextSize(txt, font, 0.5, 2)[0]
261
+ cv2.rectangle(
262
+ image,
263
+ (int(bbox[0]), int(bbox[1] - cat_size[1] - 2)),
264
+ (int(bbox[0] + cat_size[0]), int(bbox[1] - 2)),
265
+ (int(cl[0]), int(cl[1]), int(cl[2])), -1)
266
+ cv2.putText(
267
+ image, txt, (int(bbox[0]), int(bbox[1] - 2)),
268
+ font, 0.5, (0, 0, 0), thickness=1, lineType=cv2.LINE_AA)
269
+ if proposals is not None:
270
+ proposal_image = images[i].detach().cpu().numpy().transpose(1, 2, 0).astype(np.uint8).copy()
271
+ if bgr:
272
+ proposal_image = proposal_image.copy()
273
+ else:
274
+ proposal_image = proposal_image[:, :, ::-1].copy()
275
+ bboxes = proposals[i].proposal_boxes.tensor.cpu().numpy()
276
+ if proposals[i].has('scores'):
277
+ scores = proposals[i].scores.detach().cpu().numpy()
278
+ else:
279
+ scores = proposals[i].objectness_logits.detach().cpu().numpy()
280
+ # selected = -1
281
+ # if proposals[i].has('image_loss'):
282
+ # selected = proposals[i].image_loss.argmin()
283
+ if proposals[i].has('selected'):
284
+ selected = proposals[i].selected
285
+ else:
286
+ selected = [-1 for _ in range(len(bboxes))]
287
+ for j in range(len(bboxes)):
288
+ if scores[j] > vis_thresh or selected[j] >= 0:
289
+ bbox = bboxes[j]
290
+ cl = (209, 159, 83)
291
+ th = 2
292
+ if selected[j] >= 0:
293
+ cl = (0, 0, 0xa4)
294
+ th = 4
295
+ cv2.rectangle(
296
+ proposal_image,
297
+ (int(bbox[0]), int(bbox[1])),
298
+ (int(bbox[2]), int(bbox[3])),
299
+ cl, th, cv2.LINE_AA)
300
+ if selected[j] >= 0 and debug_show_name:
301
+ cat = selected[j].item()
302
+ txt = '{}'.format(cat2name[cat])
303
+ font = cv2.FONT_HERSHEY_SIMPLEX
304
+ cat_size = cv2.getTextSize(txt, font, 0.5, 2)[0]
305
+ cv2.rectangle(
306
+ proposal_image,
307
+ (int(bbox[0]), int(bbox[1] - cat_size[1] - 2)),
308
+ (int(bbox[0] + cat_size[0]), int(bbox[1] - 2)),
309
+ (int(cl[0]), int(cl[1]), int(cl[2])), -1)
310
+ cv2.putText(
311
+ proposal_image, txt,
312
+ (int(bbox[0]), int(bbox[1] - 2)),
313
+ font, 0.5, (0, 0, 0), thickness=1,
314
+ lineType=cv2.LINE_AA)
315
+
316
+ if save_debug:
317
+ global cnt
318
+ cnt = (cnt + 1) % 5000
319
+ if not os.path.exists(save_debug_path):
320
+ os.mkdir(save_debug_path)
321
+ save_name = '{}/{:05d}.jpg'.format(save_debug_path, cnt)
322
+ if i < len(image_labels):
323
+ image_label = image_labels[i]
324
+ save_name = '{}/{:05d}'.format(save_debug_path, cnt)
325
+ for x in image_label:
326
+ class_name = cat2name[x]
327
+ save_name = save_name + '|{}'.format(class_name)
328
+ save_name = save_name + '.jpg'
329
+ cv2.imwrite(save_name, proposal_image)
330
+ else:
331
+ cv2.imshow('image', image)
332
+ if proposals is not None:
333
+ cv2.imshow('proposals', proposal_image)
334
+ cv2.waitKey()
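The helpers in this file index into tensors that are flattened level-by-level and, within each level, image-by-image. A small self-contained sketch (editorial, not part of the commit) of the decoding performed by _ind2il above:

shapes_per_level = [(4, 4), (2, 2)]     # (H_l, W_l) for two feature levels
N = 3                                   # images in the batch

def ind2il(ind):
    s, l = 0, 0
    while ind - s >= N * shapes_per_level[l][0] * shapes_per_level[l][1]:
        s += N * shapes_per_level[l][0] * shapes_per_level[l][1]
        l += 1
    i = (ind - s) // (shapes_per_level[l][0] * shapes_per_level[l][1])
    return i, l

print(ind2il(0))     # (0, 0): first cell of image 0 at level 0
print(ind2il(20))    # (1, 0): 16 cells per image at level 0, so index 20 falls in image 1
print(ind2il(50))    # (0, 1): past 3 * 16 = 48 cells, so level 1, image 0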
proxydet/modeling/meta_arch/custom_rcnn.py ADDED
@@ -0,0 +1,232 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ import copy
3
+ import logging
4
+ import numpy as np
5
+ from typing import Dict, List, Optional, Tuple
6
+ import torch
7
+ from torch import nn
8
+ import json
9
+ from detectron2.utils.events import get_event_storage
10
+ from detectron2.config import configurable
11
+ from detectron2.structures import ImageList, Instances, Boxes
12
+ import detectron2.utils.comm as comm
13
+
14
+ from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY
15
+ from detectron2.modeling.meta_arch.rcnn import GeneralizedRCNN
16
+ from detectron2.modeling.postprocessing import detector_postprocess
17
+ from detectron2.utils.visualizer import Visualizer, _create_text_labels
18
+ from detectron2.data.detection_utils import convert_image_to_rgb
19
+
20
+ from torch.cuda.amp import autocast
21
+ from ..text.text_encoder import build_text_encoder
22
+ from ..utils import load_class_freq, get_fed_loss_inds
23
+
24
+ @META_ARCH_REGISTRY.register()
25
+ class CustomRCNN(GeneralizedRCNN):
26
+ '''
27
+ Add image labels
28
+ '''
29
+ @configurable
30
+ def __init__(
31
+ self,
32
+ with_image_labels = False,
33
+ dataset_loss_weight = [],
34
+ fp16 = False,
35
+ sync_caption_batch = False,
36
+ roi_head_name = '',
37
+ cap_batch_ratio = 4,
38
+ with_caption = False,
39
+ dynamic_classifier = False,
40
+ **kwargs):
41
+ """
42
+ """
43
+ self.with_image_labels = with_image_labels
44
+ self.dataset_loss_weight = dataset_loss_weight
45
+ self.fp16 = fp16
46
+ self.with_caption = with_caption
47
+ self.sync_caption_batch = sync_caption_batch
48
+ self.roi_head_name = roi_head_name
49
+ self.cap_batch_ratio = cap_batch_ratio
50
+ self.dynamic_classifier = dynamic_classifier
51
+ self.return_proposal = False
52
+ if self.dynamic_classifier:
53
+ self.freq_weight = kwargs.pop('freq_weight')
54
+ self.num_classes = kwargs.pop('num_classes')
55
+ self.num_sample_cats = kwargs.pop('num_sample_cats')
56
+ super().__init__(**kwargs)
57
+ assert self.proposal_generator is not None
58
+ if self.with_caption:
59
+ assert not self.dynamic_classifier
60
+ self.text_encoder = build_text_encoder(pretrain=True)
61
+ for v in self.text_encoder.parameters():
62
+ v.requires_grad = False
63
+
64
+
65
+ @classmethod
66
+ def from_config(cls, cfg):
67
+ ret = super().from_config(cfg)
68
+ ret.update({
69
+ 'with_image_labels': cfg.WITH_IMAGE_LABELS,
70
+ 'dataset_loss_weight': cfg.MODEL.DATASET_LOSS_WEIGHT,
71
+ 'fp16': cfg.FP16,
72
+ 'with_caption': cfg.MODEL.WITH_CAPTION,
73
+ 'sync_caption_batch': cfg.MODEL.SYNC_CAPTION_BATCH,
74
+ 'dynamic_classifier': cfg.MODEL.DYNAMIC_CLASSIFIER,
75
+ 'roi_head_name': cfg.MODEL.ROI_HEADS.NAME,
76
+ 'cap_batch_ratio': cfg.MODEL.CAP_BATCH_RATIO,
77
+ })
78
+ if ret['dynamic_classifier']:
79
+ ret['freq_weight'] = load_class_freq(
80
+ cfg.MODEL.ROI_BOX_HEAD.CAT_FREQ_PATH,
81
+ cfg.MODEL.ROI_BOX_HEAD.FED_LOSS_FREQ_WEIGHT)
82
+ ret['num_classes'] = cfg.MODEL.ROI_HEADS.NUM_CLASSES
83
+ ret['num_sample_cats'] = cfg.MODEL.NUM_SAMPLE_CATS
84
+ return ret
85
+
86
+
87
+ def inference(
88
+ self,
89
+ batched_inputs: Tuple[Dict[str, torch.Tensor]],
90
+ detected_instances: Optional[List[Instances]] = None,
91
+ do_postprocess: bool = True,
92
+ ):
93
+ assert not self.training
94
+ assert detected_instances is None
95
+
96
+ images = self.preprocess_image(batched_inputs)
97
+ features = self.backbone(images.tensor)
98
+ proposals, _ = self.proposal_generator(images, features, None)
99
+ results, _ = self.roi_heads(images, features, proposals)
100
+ if do_postprocess:
101
+ assert not torch.jit.is_scripting(), \
102
+ "Scripting is not supported for postprocess."
103
+ return CustomRCNN._postprocess(
104
+ results, batched_inputs, images.image_sizes)
105
+ else:
106
+ return results
107
+
108
+
109
+ def forward(self, batched_inputs: List[Dict[str, torch.Tensor]]):
110
+ """
111
+ Add ann_type
112
+ Ignore proposal loss when training with image labels
113
+ """
114
+ if not self.training:
115
+ return self.inference(batched_inputs)
116
+
117
+ images = self.preprocess_image(batched_inputs)
118
+
119
+ ann_type = 'box'
120
+ gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
121
+ if self.with_image_labels:
122
+ for inst, x in zip(gt_instances, batched_inputs):
123
+ inst._ann_type = x['ann_type']
124
+ inst._pos_category_ids = x['pos_category_ids']
125
+ ann_types = [x['ann_type'] for x in batched_inputs]
126
+ assert len(set(ann_types)) == 1
127
+ ann_type = ann_types[0]
128
+ if ann_type in ['prop', 'proptag']:
129
+ for t in gt_instances:
130
+ t.gt_classes *= 0
131
+
132
+ if self.fp16: # TODO (zhouxy): improve
133
+ with autocast():
134
+ features = self.backbone(images.tensor.half())
135
+ features = {k: v.float() for k, v in features.items()}
136
+ else:
137
+ features = self.backbone(images.tensor)
138
+
139
+ cls_features, cls_inds, caption_features = None, None, None
140
+
141
+ if self.with_caption and 'caption' in ann_type:
142
+ inds = [torch.randint(len(x['captions']), (1,))[0].item() \
143
+ for x in batched_inputs]
144
+ caps = [x['captions'][ind] for ind, x in zip(inds, batched_inputs)]
145
+ caption_features = self.text_encoder(caps).float()
146
+ if self.sync_caption_batch:
147
+ caption_features = self._sync_caption_features(
148
+ caption_features, ann_type, len(batched_inputs))
149
+
150
+ if self.dynamic_classifier and ann_type != 'caption':
151
+ cls_inds = self._sample_cls_inds(gt_instances, ann_type) # inds, inv_inds
152
+ ind_with_bg = cls_inds[0].tolist() + [-1]
153
+ cls_features = self.roi_heads.box_predictor[
154
+ 0].cls_score.zs_weight[:, ind_with_bg].permute(1, 0).contiguous()
155
+
156
+ classifier_info = cls_features, cls_inds, caption_features
157
+ proposals, proposal_losses = self.proposal_generator(
158
+ images, features, gt_instances)
159
+
160
+ if self.roi_head_name in ['StandardROIHeads', 'CascadeROIHeads']:
161
+ proposals, detector_losses = self.roi_heads(
162
+ images, features, proposals, gt_instances)
163
+ else:
164
+ proposals, detector_losses = self.roi_heads(
165
+ images, features, proposals, gt_instances,
166
+ ann_type=ann_type, classifier_info=classifier_info)
167
+
168
+ if self.vis_period > 0:
169
+ storage = get_event_storage()
170
+ if storage.iter % self.vis_period == 0:
171
+ self.visualize_training(batched_inputs, proposals)
172
+
173
+ losses = {}
174
+ losses.update(detector_losses)
175
+ if self.with_image_labels:
176
+ if ann_type in ['box', 'prop', 'proptag']:
177
+ losses.update(proposal_losses)
178
+ else: # ignore proposal loss for non-bbox data
179
+ losses.update({k: v * 0 for k, v in proposal_losses.items()})
180
+ else:
181
+ losses.update(proposal_losses)
182
+ if len(self.dataset_loss_weight) > 0:
183
+ dataset_sources = [x['dataset_source'] for x in batched_inputs]
184
+ assert len(set(dataset_sources)) == 1
185
+ dataset_source = dataset_sources[0]
186
+ for k in losses:
187
+ losses[k] *= self.dataset_loss_weight[dataset_source]
188
+
189
+ if self.return_proposal:
190
+ return proposals, losses
191
+ else:
192
+ return losses
193
+
194
+
195
+ def _sync_caption_features(self, caption_features, ann_type, BS):
196
+ has_caption_feature = (caption_features is not None)
197
+ BS = (BS * self.cap_batch_ratio) if (ann_type == 'box') else BS
198
+ rank = torch.full(
199
+ (BS, 1), comm.get_rank(), dtype=torch.float32,
200
+ device=self.device)
201
+ if not has_caption_feature:
202
+ caption_features = rank.new_zeros((BS, 512))
203
+ caption_features = torch.cat([caption_features, rank], dim=1)
204
+ global_caption_features = comm.all_gather(caption_features)
205
+ caption_features = torch.cat(
206
+ [x.to(self.device) for x in global_caption_features], dim=0) \
207
+ if has_caption_feature else None # (NB) x (D + 1)
208
+ return caption_features
209
+
210
+
211
+ def _sample_cls_inds(self, gt_instances, ann_type='box'):
212
+ if ann_type == 'box':
213
+ gt_classes = torch.cat(
214
+ [x.gt_classes for x in gt_instances])
215
+ C = len(self.freq_weight)
216
+ freq_weight = self.freq_weight
217
+ else:
218
+ gt_classes = torch.cat(
219
+ [torch.tensor(
220
+ x._pos_category_ids,
221
+ dtype=torch.long, device=x.gt_classes.device) \
222
+ for x in gt_instances])
223
+ C = self.num_classes
224
+ freq_weight = None
225
+ assert gt_classes.max() < C, '{} {}'.format(gt_classes.max(), C)
226
+ inds = get_fed_loss_inds(
227
+ gt_classes, self.num_sample_cats, C,
228
+ weight=freq_weight)
229
+ cls_id_map = gt_classes.new_full(
230
+ (self.num_classes + 1,), len(inds))
231
+ cls_id_map[inds] = torch.arange(len(inds), device=cls_id_map.device)
232
+ return inds, cls_id_map
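A hedged sketch (editorial, not part of the commit) of how a CustomRCNN instance is driven at inference time. Building the cfg and loading a checkpoint are handled elsewhere in this commit's demo code, so the model itself is taken as a parameter here; the batched_inputs format follows detectron2's GeneralizedRCNN convention, which is what inference() and _postprocess above expect.

import cv2
import torch

def run_inference(model, image_path):
    # model: a detectron2 GeneralizedRCNN-style model such as CustomRCNN, already loaded
    img = cv2.imread(image_path)                                   # BGR, HWC, uint8
    inputs = [{
        "image": torch.as_tensor(img.transpose(2, 0, 1).copy()),   # CHW tensor
        "height": img.shape[0],                                    # size used to rescale boxes
        "width": img.shape[1],
    }]
    model.eval()
    with torch.no_grad():
        return model(inputs)[0]["instances"]                       # Boxes, scores, pred_classes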
proxydet/modeling/meta_arch/d2_deformable_detr.py ADDED
@@ -0,0 +1,308 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ import torch
3
+ import torch.nn.functional as F
4
+ from torch import nn
5
+ import math
6
+
7
+ from detectron2.modeling import META_ARCH_REGISTRY, build_backbone
8
+ from detectron2.structures import Boxes, Instances
9
+ from ..utils import load_class_freq, get_fed_loss_inds
10
+
11
+ from models.backbone import Joiner
12
+ from models.deformable_detr import DeformableDETR, SetCriterion, MLP
13
+ from models.deformable_detr import _get_clones
14
+ from models.matcher import HungarianMatcher
15
+ from models.position_encoding import PositionEmbeddingSine
16
+ from models.deformable_transformer import DeformableTransformer
17
+ from models.segmentation import sigmoid_focal_loss
18
+ from util.box_ops import box_cxcywh_to_xyxy, box_xyxy_to_cxcywh
19
+ from util.misc import NestedTensor, accuracy
20
+
21
+
22
+ __all__ = ["DeformableDetr"]
23
+
24
+ class CustomSetCriterion(SetCriterion):
25
+ def __init__(self, num_classes, matcher, weight_dict, losses, \
26
+ focal_alpha=0.25, use_fed_loss=False):
27
+ super().__init__(num_classes, matcher, weight_dict, losses, focal_alpha)
28
+ self.use_fed_loss = use_fed_loss
29
+ if self.use_fed_loss:
30
+ self.register_buffer(
31
+ 'fed_loss_weight', load_class_freq(freq_weight=0.5))
32
+
33
+ def loss_labels(self, outputs, targets, indices, num_boxes, log=True):
34
+ """Classification loss (NLL)
35
+ targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes]
36
+ """
37
+ assert 'pred_logits' in outputs
38
+ src_logits = outputs['pred_logits']
39
+
40
+ idx = self._get_src_permutation_idx(indices)
41
+ target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)])
42
+ target_classes = torch.full(src_logits.shape[:2], self.num_classes,
43
+ dtype=torch.int64, device=src_logits.device)
44
+ target_classes[idx] = target_classes_o
45
+
46
+ target_classes_onehot = torch.zeros(
47
+ [src_logits.shape[0], src_logits.shape[1], src_logits.shape[2] + 1],
48
+ dtype=src_logits.dtype, layout=src_logits.layout,
49
+ device=src_logits.device)
50
+ target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1)
51
+
52
+ target_classes_onehot = target_classes_onehot[:,:,:-1] # B x N x C
53
+ if self.use_fed_loss:
54
+ inds = get_fed_loss_inds(
55
+ gt_classes=target_classes_o,
56
+ num_sample_cats=50,
57
+ weight=self.fed_loss_weight,
58
+ C=target_classes_onehot.shape[2])
59
+ loss_ce = sigmoid_focal_loss(
60
+ src_logits[:, :, inds],
61
+ target_classes_onehot[:, :, inds],
62
+ num_boxes,
63
+ alpha=self.focal_alpha,
64
+ gamma=2) * src_logits.shape[1]
65
+ else:
66
+ loss_ce = sigmoid_focal_loss(
67
+ src_logits, target_classes_onehot, num_boxes,
68
+ alpha=self.focal_alpha,
69
+ gamma=2) * src_logits.shape[1]
70
+ losses = {'loss_ce': loss_ce}
71
+
72
+ if log:
73
+ # TODO this should probably be a separate loss, not hacked in this one here
74
+ losses['class_error'] = 100 - accuracy(src_logits[idx], target_classes_o)[0]
75
+ return losses
76
+
77
+
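The use_fed_loss branch above restricts the sigmoid focal loss to a sampled subset of categories. The actual sampler, get_fed_loss_inds, lives in the utils module imported at the top of this file and is not shown in this diff; the following is only a conceptual, self-contained sketch of the idea (keep every ground-truth class, pad with frequency-weighted random negatives), not the repo's implementation.

import torch

def sample_fed_loss_classes(gt_classes, num_sample_cats, num_classes, weight=None):
    # keep all positive classes, then sample extra negative classes by frequency weight
    unique_gt = torch.unique(gt_classes)
    if len(unique_gt) >= num_sample_cats:
        return unique_gt
    prob = torch.ones(num_classes) if weight is None else weight.clone().float()
    prob[unique_gt] = 0                               # never resample a positive class
    extra = torch.multinomial(prob, num_sample_cats - len(unique_gt), replacement=False)
    return torch.cat([unique_gt, extra])

inds = sample_fed_loss_classes(torch.tensor([3, 3, 17]), num_sample_cats=8, num_classes=100)
print(sorted(inds.tolist()))    # always contains 3 and 17, plus six sampled negatives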
78
+ class MaskedBackbone(nn.Module):
79
+ """ This is a thin wrapper around D2's backbone to provide padding masking"""
80
+
81
+ def __init__(self, cfg):
82
+ super().__init__()
83
+ self.backbone = build_backbone(cfg)
84
+ backbone_shape = self.backbone.output_shape()
85
+ self.feature_strides = [backbone_shape[f].stride for f in backbone_shape.keys()]
86
+ self.strides = [backbone_shape[f].stride for f in backbone_shape.keys()]
87
+ self.num_channels = [backbone_shape[x].channels for x in backbone_shape.keys()]
88
+
89
+ def forward(self, tensor_list: NestedTensor):
90
+ xs = self.backbone(tensor_list.tensors)
91
+ out = {}
92
+ for name, x in xs.items():
93
+ m = tensor_list.mask
94
+ assert m is not None
95
+ mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0]
96
+ out[name] = NestedTensor(x, mask)
97
+ return out
98
+
99
+ @META_ARCH_REGISTRY.register()
100
+ class DeformableDetr(nn.Module):
101
+ """
102
+ Implement Deformable Detr
103
+ """
104
+
105
+ def __init__(self, cfg):
106
+ super().__init__()
107
+ self.with_image_labels = cfg.WITH_IMAGE_LABELS
108
+ self.weak_weight = cfg.MODEL.DETR.WEAK_WEIGHT
109
+
110
+ self.device = torch.device(cfg.MODEL.DEVICE)
111
+ self.test_topk = cfg.TEST.DETECTIONS_PER_IMAGE
112
+ self.num_classes = cfg.MODEL.DETR.NUM_CLASSES
113
+ self.mask_on = cfg.MODEL.MASK_ON
114
+ hidden_dim = cfg.MODEL.DETR.HIDDEN_DIM
115
+ num_queries = cfg.MODEL.DETR.NUM_OBJECT_QUERIES
116
+
117
+ # Transformer parameters:
118
+ nheads = cfg.MODEL.DETR.NHEADS
119
+ dropout = cfg.MODEL.DETR.DROPOUT
120
+ dim_feedforward = cfg.MODEL.DETR.DIM_FEEDFORWARD
121
+ enc_layers = cfg.MODEL.DETR.ENC_LAYERS
122
+ dec_layers = cfg.MODEL.DETR.DEC_LAYERS
123
+ num_feature_levels = cfg.MODEL.DETR.NUM_FEATURE_LEVELS
124
+ two_stage = cfg.MODEL.DETR.TWO_STAGE
125
+ with_box_refine = cfg.MODEL.DETR.WITH_BOX_REFINE
126
+
127
+ # Loss parameters:
128
+ giou_weight = cfg.MODEL.DETR.GIOU_WEIGHT
129
+ l1_weight = cfg.MODEL.DETR.L1_WEIGHT
130
+ deep_supervision = cfg.MODEL.DETR.DEEP_SUPERVISION
131
+ cls_weight = cfg.MODEL.DETR.CLS_WEIGHT
132
+ focal_alpha = cfg.MODEL.DETR.FOCAL_ALPHA
133
+
134
+ N_steps = hidden_dim // 2
135
+ d2_backbone = MaskedBackbone(cfg)
136
+ backbone = Joiner(d2_backbone, PositionEmbeddingSine(N_steps, normalize=True))
137
+
138
+ transformer = DeformableTransformer(
139
+ d_model=hidden_dim,
140
+ nhead=nheads,
141
+ num_encoder_layers=enc_layers,
142
+ num_decoder_layers=dec_layers,
143
+ dim_feedforward=dim_feedforward,
144
+ dropout=dropout,
145
+ activation="relu",
146
+ return_intermediate_dec=True,
147
+ num_feature_levels=num_feature_levels,
148
+ dec_n_points=4,
149
+ enc_n_points=4,
150
+ two_stage=two_stage,
151
+ two_stage_num_proposals=num_queries)
152
+
153
+ self.detr = DeformableDETR(
154
+ backbone, transformer, num_classes=self.num_classes,
155
+ num_queries=num_queries,
156
+ num_feature_levels=num_feature_levels,
157
+ aux_loss=deep_supervision,
158
+ with_box_refine=with_box_refine,
159
+ two_stage=two_stage,
160
+ )
161
+
162
+ if self.mask_on:
163
+ assert 0, 'Mask is not supported yet :('
164
+
165
+ matcher = HungarianMatcher(
166
+ cost_class=cls_weight, cost_bbox=l1_weight, cost_giou=giou_weight)
167
+ weight_dict = {"loss_ce": cls_weight, "loss_bbox": l1_weight}
168
+ weight_dict["loss_giou"] = giou_weight
169
+ if deep_supervision:
170
+ aux_weight_dict = {}
171
+ for i in range(dec_layers - 1):
172
+ aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()})
173
+ weight_dict.update(aux_weight_dict)
174
+ print('weight_dict', weight_dict)
175
+ losses = ["labels", "boxes", "cardinality"]
176
+ if self.mask_on:
177
+ losses += ["masks"]
178
+ self.criterion = CustomSetCriterion(
179
+ self.num_classes, matcher=matcher, weight_dict=weight_dict,
180
+ focal_alpha=focal_alpha,
181
+ losses=losses,
182
+ use_fed_loss=cfg.MODEL.DETR.USE_FED_LOSS
183
+ )
184
+ pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1)
185
+ pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1)
186
+ self.normalizer = lambda x: (x - pixel_mean) / pixel_std
187
+
188
+
189
+ def forward(self, batched_inputs):
190
+ """
191
+ Args:
+ batched_inputs: a list of dicts in detectron2's standard format
+ (keys "image", "height", "width", plus "instances" during training).
191
+ Returns:
193
+ dict[str: Tensor]:
194
+ mapping from a named loss to a tensor storing the loss. Used during training only.
195
+ """
196
+ images = self.preprocess_image(batched_inputs)
197
+ output = self.detr(images)
198
+ if self.training:
199
+ gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
200
+ targets = self.prepare_targets(gt_instances)
201
+ loss_dict = self.criterion(output, targets)
202
+ weight_dict = self.criterion.weight_dict
203
+ for k in loss_dict.keys():
204
+ if k in weight_dict:
205
+ loss_dict[k] *= weight_dict[k]
206
+ if self.with_image_labels:
207
+ if batched_inputs[0]['ann_type'] in ['image', 'captiontag']:
208
+ loss_dict['loss_image'] = self.weak_weight * self._weak_loss(
209
+ output, batched_inputs)
210
+ else:
211
+ loss_dict['loss_image'] = images[0].new_zeros(
212
+ [1], dtype=torch.float32)[0]
213
+ # import pdb; pdb.set_trace()
214
+ return loss_dict
215
+ else:
216
+ image_sizes = output["pred_boxes"].new_tensor(
217
+ [(t["height"], t["width"]) for t in batched_inputs])
218
+ results = self.post_process(output, image_sizes)
219
+ return results
220
+
221
+
222
+ def prepare_targets(self, targets):
223
+ new_targets = []
224
+ for targets_per_image in targets:
225
+ h, w = targets_per_image.image_size
226
+ image_size_xyxy = torch.as_tensor([w, h, w, h], dtype=torch.float, device=self.device)
227
+ gt_classes = targets_per_image.gt_classes
228
+ gt_boxes = targets_per_image.gt_boxes.tensor / image_size_xyxy
229
+ gt_boxes = box_xyxy_to_cxcywh(gt_boxes)
230
+ new_targets.append({"labels": gt_classes, "boxes": gt_boxes})
231
+ if self.mask_on and hasattr(targets_per_image, 'gt_masks'):
232
+ assert 0, 'Mask is not supported yet :('
233
+ gt_masks = targets_per_image.gt_masks
234
+ gt_masks = convert_coco_poly_to_mask(gt_masks.polygons, h, w)
235
+ new_targets[-1].update({'masks': gt_masks})
236
+ return new_targets
237
+
238
+
239
+ def post_process(self, outputs, target_sizes):
240
+ """
241
+ Convert raw DETR outputs into detectron2 Instances: keep the top-k (class, box) pairs
+ and rescale the normalized cxcywh boxes to absolute xyxy pixel coordinates.
+ """
242
+ out_logits, out_bbox = outputs['pred_logits'], outputs['pred_boxes']
243
+ assert len(out_logits) == len(target_sizes)
244
+ assert target_sizes.shape[1] == 2
245
+
246
+ prob = out_logits.sigmoid()
247
+ topk_values, topk_indexes = torch.topk(
248
+ prob.view(out_logits.shape[0], -1), self.test_topk, dim=1)
249
+ scores = topk_values
250
+ topk_boxes = topk_indexes // out_logits.shape[2]
251
+ labels = topk_indexes % out_logits.shape[2]
252
+ boxes = box_cxcywh_to_xyxy(out_bbox)
253
+ boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1,1,4))
254
+
255
+ # and from relative [0, 1] to absolute [0, height] coordinates
256
+ img_h, img_w = target_sizes.unbind(1)
257
+ scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
258
+ boxes = boxes * scale_fct[:, None, :]
259
+
260
+ results = []
261
+ for s, l, b, size in zip(scores, labels, boxes, target_sizes):
262
+ r = Instances((size[0], size[1]))
263
+ r.pred_boxes = Boxes(b)
264
+ r.scores = s
265
+ r.pred_classes = l
266
+ results.append({'instances': r})
267
+ return results
268
+
269
+
270
+ def preprocess_image(self, batched_inputs):
271
+ """
272
+ Normalize, pad and batch the input images.
273
+ """
274
+ images = [self.normalizer(x["image"].to(self.device)) for x in batched_inputs]
275
+ return images
276
+
277
+
278
+ def _weak_loss(self, outputs, batched_inputs):
279
+ loss = 0
280
+ for b, x in enumerate(batched_inputs):
281
+ labels = x['pos_category_ids']
282
+ pred_logits = [outputs['pred_logits'][b]]
283
+ pred_boxes = [outputs['pred_boxes'][b]]
284
+ for xx in outputs['aux_outputs']:
285
+ pred_logits.append(xx['pred_logits'][b])
286
+ pred_boxes.append(xx['pred_boxes'][b])
287
+ pred_logits = torch.stack(pred_logits, dim=0) # L x N x C
288
+ pred_boxes = torch.stack(pred_boxes, dim=0) # L x N x 4
289
+ for label in labels:
290
+ loss += self._max_size_loss(
291
+ pred_logits, pred_boxes, label) / len(labels)
292
+ loss = loss / len(batched_inputs)
293
+ return loss
294
+
295
+
296
+ def _max_size_loss(self, logits, boxes, label):
297
+ '''
298
+ Inputs:
299
+ logits: L x N x C
300
+ boxes: L x N x 4
301
+ '''
302
+ target = logits.new_zeros((logits.shape[0], logits.shape[2]))
303
+ target[:, label] = 1.
304
+ sizes = boxes[..., 2] * boxes[..., 3] # L x N
305
+ ind = sizes.argmax(dim=1) # L
306
+ loss = F.binary_cross_entropy_with_logits(
307
+ logits[range(len(ind)), ind], target, reduction='sum')
308
+ return loss
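A self-contained sketch (editorial, not part of the commit) of the coordinate handling in post_process above: predicted boxes arrive as normalized (cx, cy, w, h) and are turned into absolute xyxy pixels by the cxcywh-to-xyxy conversion (mirroring the util.box_ops helper imported at the top of this file) followed by a per-image (w, h, w, h) scale.

import torch

def box_cxcywh_to_xyxy(b):
    cx, cy, w, h = b.unbind(-1)
    return torch.stack([cx - 0.5 * w, cy - 0.5 * h,
                        cx + 0.5 * w, cy + 0.5 * h], dim=-1)

pred = torch.tensor([[0.5, 0.5, 0.2, 0.4]])   # one box, normalized to [0, 1]
img_h, img_w = 480, 640
scale = torch.tensor([img_w, img_h, img_w, img_h], dtype=torch.float32)
print(box_cxcywh_to_xyxy(pred) * scale)       # tensor([[256., 144., 384., 336.]])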
proxydet/modeling/roi_heads/proxydet_fast_rcnn.py ADDED
@@ -0,0 +1,618 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ '''
3
+ Modifications Copyright (c) 2024-present NAVER Corp, Apache License v2.0
4
+ original source: https://github.com/facebookresearch/Detic/blob/main/detic/modeling/roi_heads/detic_fast_rcnn.py
5
+ '''
6
+ import logging
7
+ import math
8
+ import json
9
+ import numpy as np
10
+ from typing import Dict, Union
11
+ import torch
12
+ from fvcore.nn import giou_loss, smooth_l1_loss
13
+ from torch import nn
14
+ from torch.nn import functional as F
15
+ import fvcore.nn.weight_init as weight_init
16
+ import detectron2.utils.comm as comm
17
+ from detectron2.config import configurable
18
+ from detectron2.layers import ShapeSpec, batched_nms, cat, cross_entropy, nonzero_tuple
19
+ from detectron2.structures import Boxes, Instances
20
+ from detectron2.utils.events import get_event_storage
21
+ from detectron2.modeling.box_regression import Box2BoxTransform
22
+ from detectron2.modeling.roi_heads.fast_rcnn import FastRCNNOutputLayers
23
+ from detectron2.modeling.roi_heads.fast_rcnn import fast_rcnn_inference
24
+ from detectron2.modeling.roi_heads.fast_rcnn import _log_classification_stats
25
+
26
+ from torch.cuda.amp import autocast
27
+ from ..utils import load_class_freq, get_fed_loss_inds
28
+ from .zero_shot_classifier import ZeroShotClassifier
29
+
30
+ __all__ = ["ProxydetFastRCNNOutputLayers"]
31
+
32
+
33
+ class ProxydetFastRCNNOutputLayers(FastRCNNOutputLayers):
34
+ @configurable
35
+ def __init__(
36
+ self,
37
+ input_shape: ShapeSpec,
38
+ *,
39
+ mult_proposal_score=False,
40
+ cls_score=None,
41
+ sync_caption_batch = False,
42
+ use_sigmoid_ce = False,
43
+ use_fed_loss = False,
44
+ ignore_zero_cats = False,
45
+ fed_loss_num_cat = 50,
46
+ dynamic_classifier = False,
47
+ image_label_loss = '',
48
+ use_zeroshot_cls = False,
49
+ image_loss_weight = 0.1,
50
+ with_softmax_prop = False,
51
+ caption_weight = 1.0,
52
+ neg_cap_weight = 1.0,
53
+ add_image_box = False,
54
+ debug = False,
55
+ prior_prob = 0.01,
56
+ cat_freq_path = '',
57
+ fed_loss_freq_weight = 0.5,
58
+ softmax_weak_loss = False,
59
+ use_regional_embedding=False,
60
+ base_cat_mask: str = None,
61
+ **kwargs,
62
+ ):
63
+ super().__init__(
64
+ input_shape=input_shape,
65
+ **kwargs,
66
+ )
67
+ self.mult_proposal_score = mult_proposal_score
68
+ self.sync_caption_batch = sync_caption_batch
69
+ self.use_sigmoid_ce = use_sigmoid_ce
70
+ self.use_fed_loss = use_fed_loss
71
+ self.ignore_zero_cats = ignore_zero_cats
72
+ self.fed_loss_num_cat = fed_loss_num_cat
73
+ self.dynamic_classifier = dynamic_classifier
74
+ self.image_label_loss = image_label_loss
75
+ self.use_zeroshot_cls = use_zeroshot_cls
76
+ self.image_loss_weight = image_loss_weight
77
+ self.with_softmax_prop = with_softmax_prop
78
+ self.caption_weight = caption_weight
79
+ self.neg_cap_weight = neg_cap_weight
80
+ self.add_image_box = add_image_box
81
+ self.softmax_weak_loss = softmax_weak_loss
82
+ self.debug = debug
83
+ self.use_regional_embedding = use_regional_embedding
84
+ self.base_cat_mask = torch.tensor(np.load(base_cat_mask).nonzero()[0])
85
+
86
+ if softmax_weak_loss:
87
+ assert image_label_loss in ['max_size']
88
+
89
+ if self.use_sigmoid_ce:
90
+ bias_value = -math.log((1 - prior_prob) / prior_prob)
91
+ nn.init.constant_(self.cls_score.bias, bias_value)
92
+
93
+ if self.use_fed_loss or self.ignore_zero_cats:
94
+ freq_weight = load_class_freq(cat_freq_path, fed_loss_freq_weight)
95
+ self.register_buffer('freq_weight', freq_weight)
96
+ else:
97
+ self.freq_weight = None
98
+
99
+ if self.use_fed_loss and len(self.freq_weight) < self.num_classes:
100
+ # assert self.num_classes == 11493
101
+ print('Extending federated loss weight')
102
+ self.freq_weight = torch.cat(
103
+ [self.freq_weight,
104
+ self.freq_weight.new_zeros(
105
+ self.num_classes - len(self.freq_weight))]
106
+ )
107
+
108
+ assert (not self.dynamic_classifier) or (not self.use_fed_loss)
109
+ input_size = input_shape.channels * \
110
+ (input_shape.width or 1) * (input_shape.height or 1)
111
+
112
+ if self.use_zeroshot_cls:
113
+ del self.cls_score
114
+ del self.bbox_pred
115
+ assert cls_score is not None
116
+ self.cls_score = cls_score
117
+ self.bbox_pred = nn.Sequential(
118
+ nn.Linear(input_size, input_size),
119
+ nn.ReLU(inplace=True),
120
+ nn.Linear(input_size, 4)
121
+ )
122
+ weight_init.c2_xavier_fill(self.bbox_pred[0])
123
+ nn.init.normal_(self.bbox_pred[-1].weight, std=0.001)
124
+ nn.init.constant_(self.bbox_pred[-1].bias, 0)
125
+
126
+ if self.with_softmax_prop:
127
+ self.prop_score = nn.Sequential(
128
+ nn.Linear(input_size, input_size),
129
+ nn.ReLU(inplace=True),
130
+ nn.Linear(input_size, self.num_classes + 1),
131
+ )
132
+ weight_init.c2_xavier_fill(self.prop_score[0])
133
+ nn.init.normal_(self.prop_score[-1].weight, mean=0, std=0.001)
134
+ nn.init.constant_(self.prop_score[-1].bias, 0)
135
+
136
+
137
+ @classmethod
138
+ def from_config(cls, cfg, input_shape):
139
+ ret = super().from_config(cfg, input_shape)
140
+ ret.update({
141
+ 'mult_proposal_score': cfg.MODEL.ROI_BOX_HEAD.MULT_PROPOSAL_SCORE,
142
+ 'sync_caption_batch': cfg.MODEL.SYNC_CAPTION_BATCH,
143
+ 'use_sigmoid_ce': cfg.MODEL.ROI_BOX_HEAD.USE_SIGMOID_CE,
144
+ 'use_fed_loss': cfg.MODEL.ROI_BOX_HEAD.USE_FED_LOSS,
145
+ 'ignore_zero_cats': cfg.MODEL.ROI_BOX_HEAD.IGNORE_ZERO_CATS,
146
+ 'fed_loss_num_cat': cfg.MODEL.ROI_BOX_HEAD.FED_LOSS_NUM_CAT,
147
+ 'dynamic_classifier': cfg.MODEL.DYNAMIC_CLASSIFIER,
148
+ 'image_label_loss': cfg.MODEL.ROI_BOX_HEAD.IMAGE_LABEL_LOSS,
149
+ 'use_zeroshot_cls': cfg.MODEL.ROI_BOX_HEAD.USE_ZEROSHOT_CLS,
150
+ 'image_loss_weight': cfg.MODEL.ROI_BOX_HEAD.IMAGE_LOSS_WEIGHT,
151
+ 'with_softmax_prop': cfg.MODEL.ROI_BOX_HEAD.WITH_SOFTMAX_PROP,
152
+ 'caption_weight': cfg.MODEL.ROI_BOX_HEAD.CAPTION_WEIGHT,
153
+ 'neg_cap_weight': cfg.MODEL.ROI_BOX_HEAD.NEG_CAP_WEIGHT,
154
+ 'add_image_box': cfg.MODEL.ROI_BOX_HEAD.ADD_IMAGE_BOX,
155
+ 'debug': cfg.DEBUG or cfg.SAVE_DEBUG or cfg.IS_DEBUG,
156
+ 'prior_prob': cfg.MODEL.ROI_BOX_HEAD.PRIOR_PROB,
157
+ 'cat_freq_path': cfg.MODEL.ROI_BOX_HEAD.CAT_FREQ_PATH,
158
+ 'fed_loss_freq_weight': cfg.MODEL.ROI_BOX_HEAD.FED_LOSS_FREQ_WEIGHT,
159
+ 'softmax_weak_loss': cfg.MODEL.ROI_BOX_HEAD.SOFTMAX_WEAK_LOSS,
160
+ "use_regional_embedding": cfg.MODEL.ROI_BOX_HEAD.USE_REGIONAL_EMBEDDING,
161
+ "base_cat_mask": cfg.MODEL.ROI_HEADS.BASE_CAT_MASK,
162
+ })
163
+ if ret['use_zeroshot_cls']:
164
+ ret['cls_score'] = ZeroShotClassifier(cfg, input_shape)
165
+ return ret
166
+
167
+ def losses(self, predictions, proposals, \
168
+ use_advanced_loss=True,
169
+ classifier_info=(None,None,None)):
170
+ """
171
+ enable advanced loss
172
+ """
173
+ scores, proposal_deltas = predictions
174
+ gt_classes = (
175
+ cat([p.gt_classes for p in proposals], dim=0) if len(proposals) else torch.empty(0)
176
+ )
177
+ num_classes = self.num_classes
178
+ if self.dynamic_classifier:
179
+ _, cls_id_map = classifier_info[1]
180
+ gt_classes = cls_id_map[gt_classes]
181
+ num_classes = scores.shape[1] - 1
182
+ assert cls_id_map[self.num_classes] == num_classes
183
+ _log_classification_stats(scores, gt_classes)
184
+
185
+ if len(proposals):
186
+ proposal_boxes = cat([p.proposal_boxes.tensor for p in proposals], dim=0) # Nx4
187
+ assert not proposal_boxes.requires_grad, "Proposals should not require gradients!"
188
+ gt_boxes = cat(
189
+ [(p.gt_boxes if p.has("gt_boxes") else p.proposal_boxes).tensor for p in proposals],
190
+ dim=0,
191
+ )
192
+ else:
193
+ proposal_boxes = gt_boxes = torch.empty((0, 4), device=proposal_deltas.device)
194
+
195
+ if self.use_sigmoid_ce:
196
+ loss_cls = self.sigmoid_cross_entropy_loss(scores, gt_classes)
197
+ else:
198
+ loss_cls = self.softmax_cross_entropy_loss(scores, gt_classes)
199
+ return {
200
+ "loss_cls": loss_cls,
201
+ "loss_box_reg": self.box_reg_loss(
202
+ proposal_boxes, gt_boxes, proposal_deltas, gt_classes,
203
+ num_classes=num_classes)
204
+ }
205
+
206
+
207
+ def sigmoid_cross_entropy_loss(self, pred_class_logits, gt_classes):
208
+ if pred_class_logits.numel() == 0:
209
+ return pred_class_logits.new_zeros([1])[0] # This is more robust than .sum() * 0.
210
+
211
+ B = pred_class_logits.shape[0]
212
+ C = pred_class_logits.shape[1] - 1
213
+
214
+ target = pred_class_logits.new_zeros(B, C + 1)
215
+ target[range(len(gt_classes)), gt_classes] = 1 # B x (C + 1)
216
+ target = target[:, :C] # B x C
217
+
218
+ weight = 1
219
+
220
+ if self.use_fed_loss and (self.freq_weight is not None): # fedloss
221
+ appeared = get_fed_loss_inds(
222
+ gt_classes,
223
+ num_sample_cats=self.fed_loss_num_cat,
224
+ C=C,
225
+ weight=self.freq_weight)
226
+ appeared_mask = appeared.new_zeros(C + 1)
227
+ appeared_mask[appeared] = 1 # C + 1
228
+ appeared_mask = appeared_mask[:C]
229
+ fed_w = appeared_mask.view(1, C).expand(B, C)
230
+ weight = weight * fed_w.float()
231
+ if self.ignore_zero_cats and (self.freq_weight is not None):
232
+ w = (self.freq_weight.view(-1) > 1e-4).float()
233
+ weight = weight * w.view(1, C).expand(B, C)
234
+ # import pdb; pdb.set_trace()
235
+
236
+ cls_loss = F.binary_cross_entropy_with_logits(
237
+ pred_class_logits[:, :-1], target, reduction="none"
238
+ ) # B x C
239
+ loss = torch.sum(cls_loss * weight) / B
240
+ return loss
241
+
242
+
243
+ def softmax_cross_entropy_loss(self, pred_class_logits, gt_classes):
244
+ """
245
+ change _no_instance handling
246
+ """
247
+ if pred_class_logits.numel() == 0:
248
+ return pred_class_logits.new_zeros([1])[0]
249
+
250
+ if self.ignore_zero_cats and (self.freq_weight is not None):
251
+ zero_weight = torch.cat([
252
+ (self.freq_weight.view(-1) > 1e-4).float(),
253
+ self.freq_weight.new_ones(1)]) # C + 1
254
+ loss = F.cross_entropy(
255
+ pred_class_logits, gt_classes,
256
+ weight=zero_weight, reduction="mean")
257
+ elif self.use_fed_loss and (self.freq_weight is not None): # fedloss
258
+ C = pred_class_logits.shape[1] - 1
259
+ appeared = get_fed_loss_inds(
260
+ gt_classes,
261
+ num_sample_cats=self.fed_loss_num_cat,
262
+ C=C,
263
+ weight=self.freq_weight)
264
+ appeared_mask = appeared.new_zeros(C + 1).float()
265
+ appeared_mask[appeared] = 1. # C + 1
266
+ appeared_mask[C] = 1.
267
+ loss = F.cross_entropy(
268
+ pred_class_logits, gt_classes,
269
+ weight=appeared_mask, reduction="mean")
270
+ else:
271
+ loss = F.cross_entropy(
272
+ pred_class_logits, gt_classes, reduction="mean")
273
+ return loss
274
+
275
+
276
+ def box_reg_loss(
277
+ self, proposal_boxes, gt_boxes, pred_deltas, gt_classes,
278
+ num_classes=-1):
279
+ """
280
+ Allow custom background index
281
+ """
282
+ num_classes = num_classes if num_classes > 0 else self.num_classes
283
+ box_dim = proposal_boxes.shape[1] # 4 or 5
284
+ fg_inds = nonzero_tuple((gt_classes >= 0) & (gt_classes < num_classes))[0]
285
+ if pred_deltas.shape[1] == box_dim: # cls-agnostic regression
286
+ fg_pred_deltas = pred_deltas[fg_inds]
287
+ else:
288
+ fg_pred_deltas = pred_deltas.view(-1, self.num_classes, box_dim)[
289
+ fg_inds, gt_classes[fg_inds]
290
+ ]
291
+
292
+ if self.box_reg_loss_type == "smooth_l1":
293
+ gt_pred_deltas = self.box2box_transform.get_deltas(
294
+ proposal_boxes[fg_inds],
295
+ gt_boxes[fg_inds],
296
+ )
297
+ loss_box_reg = smooth_l1_loss(
298
+ fg_pred_deltas, gt_pred_deltas, self.smooth_l1_beta, reduction="sum"
299
+ )
300
+ elif self.box_reg_loss_type == "giou":
301
+ fg_pred_boxes = self.box2box_transform.apply_deltas(
302
+ fg_pred_deltas, proposal_boxes[fg_inds]
303
+ )
304
+ loss_box_reg = giou_loss(fg_pred_boxes, gt_boxes[fg_inds], reduction="sum")
305
+ else:
306
+ raise ValueError(f"Invalid bbox reg loss type '{self.box_reg_loss_type}'")
307
+ return loss_box_reg / max(gt_classes.numel(), 1.0)
308
+
309
+ def inference(self, predictions, proposals):
310
+ """
311
+ enable use proposal boxes
312
+ """
313
+ predictions = (predictions[0], predictions[1])
314
+ boxes = self.predict_boxes(predictions, proposals)
315
+ scores = self.predict_probs(predictions, proposals)
316
+ if self.mult_proposal_score:
317
+ proposal_scores = [p.get('objectness_logits') for p in proposals]
318
+ scores = [(s * ps[:, None]) ** 0.5 \
319
+ for s, ps in zip(scores, proposal_scores)]
320
+ image_shapes = [x.image_size for x in proposals]
321
+ return fast_rcnn_inference(
322
+ boxes,
323
+ scores,
324
+ image_shapes,
325
+ self.test_score_thresh,
326
+ self.test_nms_thresh,
327
+ self.test_topk_per_image,
328
+ )
329
+
330
+
331
+ def predict_probs(self, predictions, proposals):
332
+ """
333
+ support sigmoid
334
+ """
335
+ # scores, _ = predictions
336
+ scores = predictions[0]
337
+ num_inst_per_image = [len(p) for p in proposals]
338
+ if self.use_sigmoid_ce:
339
+ probs = scores.sigmoid()
340
+ else:
341
+ probs = F.softmax(scores, dim=-1)
342
+ return probs.split(num_inst_per_image, dim=0)
343
+
344
+
345
+ def image_label_losses(self, predictions, proposals, image_labels, \
346
+ classifier_info=(None,None,None), ann_type='image'):
347
+ '''
348
+ Inputs:
349
+ scores: N x (C + 1)
350
+ image_labels B x 1
351
+ '''
352
+ num_inst_per_image = [len(p) for p in proposals]
353
+ scores = predictions[0]
354
+ scores = scores.split(num_inst_per_image, dim=0) # B x n x (C + 1)
355
+ if self.with_softmax_prop:
356
+ prop_scores = predictions[2].split(num_inst_per_image, dim=0)
357
+ else:
358
+ prop_scores = [None for _ in num_inst_per_image]
359
+ B = len(scores)
360
+ img_box_count = 0
361
+ select_size_count = 0
362
+ select_x_count = 0
363
+ select_y_count = 0
364
+ max_score_count = 0
365
+ storage = get_event_storage()
366
+ loss = scores[0].new_zeros([1])[0]
367
+ caption_loss = scores[0].new_zeros([1])[0]
368
+ for idx, (score, labels, prop_score, p) in enumerate(zip(
369
+ scores, image_labels, prop_scores, proposals)):
370
+ if score.shape[0] == 0:
371
+ loss += score.new_zeros([1])[0]
372
+ continue
373
+ if 'caption' in ann_type:
374
+ score, caption_loss_img = self._caption_loss(
375
+ score, classifier_info, idx, B)
376
+ caption_loss += self.caption_weight * caption_loss_img
377
+ if ann_type == 'caption':
378
+ continue
379
+
380
+ if self.debug:
381
+ p.selected = score.new_zeros(
382
+ (len(p),), dtype=torch.long) - 1
383
+ for i_l, label in enumerate(labels):
384
+ if self.dynamic_classifier:
385
+ if idx == 0 and i_l == 0 and comm.is_main_process():
386
+ storage.put_scalar('stats_label', label)
387
+ label = classifier_info[1][1][label]
388
+ assert label < score.shape[1]
389
+ if self.image_label_loss in ['wsod', 'wsddn']:
390
+ loss_i, ind = self._wsddn_loss(score, prop_score, label)
391
+ elif self.image_label_loss == 'max_score':
392
+ loss_i, ind = self._max_score_loss(score, label)
393
+ elif self.image_label_loss == 'max_size':
394
+ loss_i, ind = self._max_size_loss(score, label, p)
395
+ elif self.image_label_loss == 'first':
396
+ loss_i, ind = self._first_loss(score, label)
397
+ elif self.image_label_loss == 'image':
398
+ loss_i, ind = self._image_loss(score, label)
399
+ elif self.image_label_loss == 'min_loss':
400
+ loss_i, ind = self._min_loss_loss(score, label)
401
+ else:
402
+ assert 0
403
+ loss += loss_i / len(labels)
404
+ if type(ind) == type([]):
405
+ img_box_count = sum(ind) / len(ind)
406
+ if self.debug:
407
+ for ind_i in ind:
408
+ p.selected[ind_i] = label
409
+ else:
410
+ img_box_count = ind
411
+ select_size_count = p[ind].proposal_boxes.area() / \
412
+ (p.image_size[0] * p.image_size[1])
413
+ max_score_count = score[ind, label].sigmoid()
414
+ select_x_count = (p.proposal_boxes.tensor[ind, 0] + \
415
+ p.proposal_boxes.tensor[ind, 2]) / 2 / p.image_size[1]
416
+ select_y_count = (p.proposal_boxes.tensor[ind, 1] + \
417
+ p.proposal_boxes.tensor[ind, 3]) / 2 / p.image_size[0]
418
+ if self.debug:
419
+ p.selected[ind] = label
420
+
421
+ loss = loss / B
422
+ storage.put_scalar('stats_l_image', loss.item())
423
+ if 'caption' in ann_type:
424
+ caption_loss = caption_loss / B
425
+ loss = loss + caption_loss
426
+ storage.put_scalar('stats_l_caption', caption_loss.item())
427
+ if comm.is_main_process():
428
+ storage.put_scalar('pool_stats', img_box_count)
429
+ storage.put_scalar('stats_select_size', select_size_count)
430
+ storage.put_scalar('stats_select_x', select_x_count)
431
+ storage.put_scalar('stats_select_y', select_y_count)
432
+ storage.put_scalar('stats_max_label_score', max_score_count)
433
+
434
+ return {
435
+ 'image_loss': loss * self.image_loss_weight,
436
+ 'loss_cls': score.new_zeros([1])[0],
437
+ 'loss_box_reg': score.new_zeros([1])[0]}
438
+
439
+
440
+ def forward(self, x, classifier_info=(None,None,None)):
441
+ """
442
+ enable classifier_info
443
+ """
444
+ if x.dim() > 2:
445
+ x = torch.flatten(x, start_dim=1)
446
+ scores = []
447
+
448
+ if classifier_info[0] is not None:
449
+ classifier_out = self.cls_score(x, classifier=classifier_info[0])
450
+ else:
451
+ classifier_out = self.cls_score(x)
452
+ if self.use_regional_embedding:
453
+ cls_scores, regional_embeddings = classifier_out
454
+ else:
455
+ cls_scores = classifier_out
456
+ scores.append(cls_scores)
457
+
458
+ if classifier_info[2] is not None:
459
+ classifier_out = classifier_info[2]
460
+ if self.use_regional_embedding:
461
+ cap_cls, regional_embeddings = classifier_out
462
+ else:
463
+ cap_cls = classifier_out
464
+ if self.sync_caption_batch:
465
+ caption_scores = self.cls_score(x, classifier=cap_cls[:, :-1])
466
+ else:
467
+ caption_scores = self.cls_score(x, classifier=cap_cls)
468
+ scores.append(caption_scores)
469
+ scores = torch.cat(scores, dim=1) # B x C' or B x N or B x (C'+N)
470
+
471
+ proposal_deltas = self.bbox_pred(x)
472
+ if self.with_softmax_prop:
473
+ prop_score = self.prop_score(x)
474
+ return scores, proposal_deltas, prop_score
475
+ elif self.use_regional_embedding:
476
+ # NOTE: scores: [B * # proposals for each image, 1204 (1203 + 1 bg)]
477
+ # NOTE: proposal_deltas: [B * # proposals for each image, 4]
478
+ # NOTE: regional_embeddings: [B * # proposals for each image, 512]
479
+ return scores, proposal_deltas, regional_embeddings
480
+ else:
481
+ return scores, proposal_deltas
482
+
483
+
484
+ def _caption_loss(self, score, classifier_info, idx, B):
485
+ assert (classifier_info[2] is not None)
486
+ assert self.add_image_box
487
+ cls_and_cap_num = score.shape[1]
488
+ cap_num = classifier_info[2].shape[0]
489
+ score, caption_score = score.split(
490
+ [cls_and_cap_num - cap_num, cap_num], dim=1)
491
+ # n x (C + 1), n x B
492
+ caption_score = caption_score[-1:] # 1 x B # -1: image level box
493
+ caption_target = caption_score.new_zeros(
494
+ caption_score.shape) # 1 x B or 1 x MB, M: num machines
495
+ if self.sync_caption_batch:
496
+ # caption_target: 1 x MB
497
+ rank = comm.get_rank()
498
+ global_idx = B * rank + idx
499
+ assert (classifier_info[2][
500
+ global_idx, -1] - rank) ** 2 < 1e-8, \
501
+ '{} {} {} {} {}'.format(
502
+ rank, global_idx,
503
+ classifier_info[2][global_idx, -1],
504
+ classifier_info[2].shape,
505
+ classifier_info[2][:, -1])
506
+ caption_target[:, global_idx] = 1.
507
+ else:
508
+ assert caption_score.shape[1] == B
509
+ caption_target[:, idx] = 1.
510
+ caption_loss_img = F.binary_cross_entropy_with_logits(
511
+ caption_score, caption_target, reduction='none')
512
+ if self.sync_caption_batch:
513
+ fg_mask = (caption_target > 0.5).float()
514
+ assert (fg_mask.sum().item() - 1.) ** 2 < 1e-8, '{} {}'.format(
515
+ fg_mask.shape, fg_mask)
516
+ pos_loss = (caption_loss_img * fg_mask).sum()
517
+ neg_loss = (caption_loss_img * (1. - fg_mask)).sum()
518
+ caption_loss_img = pos_loss + self.neg_cap_weight * neg_loss
519
+ else:
520
+ caption_loss_img = caption_loss_img.sum()
521
+ return score, caption_loss_img
522
+
523
+
524
+ def _wsddn_loss(self, score, prop_score, label):
525
+ assert prop_score is not None
526
+ loss = 0
527
+ final_score = score.sigmoid() * \
528
+ F.softmax(prop_score, dim=0) # B x (C + 1)
529
+ img_score = torch.clamp(
530
+ torch.sum(final_score, dim=0),
531
+ min=1e-10, max=1-1e-10) # (C + 1)
532
+ target = img_score.new_zeros(img_score.shape) # (C + 1)
533
+ target[label] = 1.
534
+ loss += F.binary_cross_entropy(img_score, target)
535
+ ind = final_score[:, label].argmax()
536
+ return loss, ind
537
+
538
+
539
+ def _max_score_loss(self, score, label):
540
+ loss = 0
541
+ target = score.new_zeros(score.shape[1])
542
+ target[label] = 1.
543
+ ind = score[:, label].argmax().item()
544
+ loss += F.binary_cross_entropy_with_logits(
545
+ score[ind], target, reduction='sum')
546
+ return loss, ind
547
+
548
+
549
+ def _min_loss_loss(self, score, label):
550
+ loss = 0
551
+ target = score.new_zeros(score.shape)
552
+ target[:, label] = 1.
553
+ with torch.no_grad():
554
+ x = F.binary_cross_entropy_with_logits(
555
+ score, target, reduction='none').sum(dim=1) # n
556
+ ind = x.argmin().item()
557
+ loss += F.binary_cross_entropy_with_logits(
558
+ score[ind], target[0], reduction='sum')
559
+ return loss, ind
560
+
561
+
562
+ def _first_loss(self, score, label):
563
+ loss = 0
564
+ target = score.new_zeros(score.shape[1])
565
+ target[label] = 1.
566
+ ind = 0
567
+ loss += F.binary_cross_entropy_with_logits(
568
+ score[ind], target, reduction='sum')
569
+ return loss, ind
570
+
571
+
572
+ def _image_loss(self, score, label):
573
+ assert self.add_image_box
574
+ target = score.new_zeros(score.shape[1])
575
+ target[label] = 1.
576
+ ind = score.shape[0] - 1
577
+ loss = F.binary_cross_entropy_with_logits(
578
+ score[ind], target, reduction='sum')
579
+ return loss, ind
580
+
581
+
582
+ def _max_size_loss(self, score, label, p):
583
+ loss = 0
584
+ target = score.new_zeros(score.shape[1])
585
+ target[label] = 1.
586
+ sizes = p.proposal_boxes.area()
587
+ ind = sizes[:-1].argmax().item() if len(sizes) > 1 else 0
588
+ if self.softmax_weak_loss:
589
+ loss += F.cross_entropy(
590
+ score[ind:ind+1],
591
+ score.new_tensor(label, dtype=torch.long).view(1),
592
+ reduction='sum')
593
+ else:
594
+ loss += F.binary_cross_entropy_with_logits(
595
+ score[ind], target, reduction='sum')
596
+ return loss, ind
597
+
598
+
599
+
600
+ def put_label_distribution(storage, hist_name, hist_counts, num_classes):
601
+ """
602
+ Log a per-class label-count histogram to the event storage.
+ """
603
+ ht_min, ht_max = 0, num_classes
604
+ hist_edges = torch.linspace(
605
+ start=ht_min, end=ht_max, steps=num_classes + 1, dtype=torch.float32)
606
+
607
+ hist_params = dict(
608
+ tag=hist_name,
609
+ min=ht_min,
610
+ max=ht_max,
611
+ num=float(hist_counts.sum()),
612
+ sum=float((hist_counts * torch.arange(len(hist_counts))).sum()),
613
+ sum_squares=float(((hist_counts * torch.arange(len(hist_counts))) ** 2).sum()),
614
+ bucket_limits=hist_edges[1:].tolist(),
615
+ bucket_counts=hist_counts.tolist(),
616
+ global_step=storage._iter,
617
+ )
618
+ storage._histograms.append(hist_params)
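The federated sigmoid cross-entropy above only back-propagates through a sampled subset of categories: classes that appear in the batch are always kept, and the remaining budget is filled by sampling classes in proportion to their frequency weights. The following is a minimal, self-contained sketch of that weighting with toy shapes; `sample_fed_loss_classes` is a simplified stand-in for `get_fed_loss_inds`, the `ignore_zero_cats` mask is omitted, and all sizes and frequency weights are illustrative only.

import torch
import torch.nn.functional as F

def sample_fed_loss_classes(gt_classes, num_sample_cats, weight):
    # Classes present in the batch are always kept; the rest of the budget is
    # drawn without replacement, with probability proportional to `weight`.
    appeared = torch.unique(gt_classes)
    prob = weight.float().clone()
    prob[appeared] = 0
    if len(appeared) < num_sample_cats:
        extra = torch.multinomial(prob, num_sample_cats - len(appeared), replacement=False)
        appeared = torch.cat([appeared, extra])
    return appeared

B, C = 8, 20                                    # toy proposal / class counts
logits = torch.randn(B, C + 1)                  # last column is the background logit
gt = torch.randint(0, C + 1, (B,))              # label C means background
freq = torch.rand(C)                            # toy per-class frequency weights

target = logits.new_zeros(B, C + 1)
target[torch.arange(B), gt] = 1
target = target[:, :C]                          # drop the background column

appeared = sample_fed_loss_classes(gt[gt < C], num_sample_cats=5, weight=freq)
class_mask = logits.new_zeros(C)
class_mask[appeared] = 1                        # only sampled classes contribute to the loss
loss = (F.binary_cross_entropy_with_logits(logits[:, :-1], target, reduction="none")
        * class_mask.view(1, C)).sum() / B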
proxydet/modeling/roi_heads/proxydet_roi_heads.py ADDED
@@ -0,0 +1,556 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ '''
3
+ Modifications Copyright (c) 2024-present NAVER Corp, Apache License v2.0
4
+ original source: https://github.com/facebookresearch/Detic/blob/main/detic/modeling/roi_heads/detic_roi_heads.py
5
+ '''
6
+ import copy
7
+ import numpy as np
8
+ import json
9
+ import math
10
+ import torch
11
+ from torch import nn
12
+ from torch.autograd.function import Function
13
+ from typing import Dict, List, Optional, Tuple, Union
14
+ from torch.nn import functional as F
15
+
16
+ from fvcore.nn import giou_loss
17
+
18
+ from detectron2.config import configurable
19
+ from detectron2.layers import ShapeSpec
20
+ from detectron2.layers import batched_nms, cat
21
+ from detectron2.structures import Boxes, Instances, pairwise_iou
22
+ from detectron2.utils.events import get_event_storage
23
+
24
+ from detectron2.modeling.box_regression import Box2BoxTransform
25
+ from detectron2.modeling.roi_heads.fast_rcnn import fast_rcnn_inference
26
+ from detectron2.modeling.roi_heads.roi_heads import ROI_HEADS_REGISTRY, StandardROIHeads
27
+ from detectron2.modeling.roi_heads.cascade_rcnn import CascadeROIHeads, _ScaleGradient
28
+ from detectron2.modeling.roi_heads.box_head import build_box_head
29
+ from .proxydet_fast_rcnn import ProxydetFastRCNNOutputLayers
30
+ from ..debug import debug_second_stage
31
+
32
+ from torch.cuda.amp import autocast
33
+ from copy import deepcopy
34
+
35
+ @ROI_HEADS_REGISTRY.register()
36
+ class ProxydetCascadeROIHeads(CascadeROIHeads):
37
+ @configurable
38
+ def __init__(
39
+ self,
40
+ *,
41
+ mult_proposal_score: bool = False,
42
+ with_image_labels: bool = False,
43
+ add_image_box: bool = False,
44
+ image_box_size: float = 1.0,
45
+ ws_num_props: int = 512,
46
+ add_feature_to_prop: bool = False,
47
+ mask_weight: float = 1.0,
48
+ one_class_per_proposal: bool = False,
49
+ use_regional_embedding: bool = False,
50
+ base_cat_mask: str = None,
51
+ cmm_stage: list = [],
52
+ cmm_stage_test: list = None,
53
+ cmm_beta: float = 1.0,
54
+ cmm_loss: str = "l1",
55
+ cmm_loss_weight: float = 1.0,
56
+ cmm_separated_branch: bool = False,
57
+ cmm_base_alpha: float = 0.5,
58
+ cmm_novel_beta: float = 0.5,
59
+ cmm_use_inl: bool = False,
60
+ cmm_prototype: str = "center",
61
+ cmm_prototype_temp: float = 1.0,
62
+ cmm_classifier_temp: float = None,
63
+ cmm_use_sigmoid_ce: bool = True,
64
+ **kwargs,
65
+ ):
66
+ super().__init__(**kwargs)
67
+ self.mult_proposal_score = mult_proposal_score
68
+ self.with_image_labels = with_image_labels
69
+ self.add_image_box = add_image_box
70
+ self.image_box_size = image_box_size
71
+ self.ws_num_props = ws_num_props
72
+ self.add_feature_to_prop = add_feature_to_prop
73
+ self.mask_weight = mask_weight
74
+ self.one_class_per_proposal = one_class_per_proposal
75
+ self.use_regional_embedding = use_regional_embedding
76
+ self.base_cat_mask = torch.tensor(np.load(base_cat_mask)).bool()
77
+ self.cmm_stage = cmm_stage
78
+ self.cmm_stage_test = cmm_stage_test
79
+ self.cmm_beta = cmm_beta
80
+ self.cmm_loss = cmm_loss
81
+ self.cmm_loss_weight = cmm_loss_weight
82
+ self.cmm_separated_branch = cmm_separated_branch
83
+ self.cmm_base_alpha = cmm_base_alpha
84
+ self.cmm_novel_beta = cmm_novel_beta
85
+ self.cmm_use_inl = cmm_use_inl
86
+ self.cmm_prototype = cmm_prototype
87
+ self.cmm_prototype_temp = cmm_prototype_temp
88
+ self.cmm_classifier_temp = cmm_classifier_temp
89
+ self.cmm_use_sigmoid_ce = cmm_use_sigmoid_ce
90
+
91
+ if self.cmm_separated_branch:
92
+ self.box_head_cmm = deepcopy(self.box_head)
93
+ self.box_predictor_cmm = deepcopy(self.box_predictor)
94
+ if self.cmm_classifier_temp is not None:
95
+ for k in range(self.num_cascade_stages):
96
+ self.box_predictor_cmm[k].cls_score.norm_temperature = self.cmm_classifier_temp
97
+ if not self.cmm_use_sigmoid_ce:
98
+ for k in range(self.num_cascade_stages):
99
+ self.box_predictor_cmm[k].use_sigmoid_ce = self.cmm_use_sigmoid_ce # using bce or ce
100
+
101
+ @classmethod
102
+ def from_config(cls, cfg, input_shape):
103
+ ret = super().from_config(cfg, input_shape)
104
+ ret.update({
105
+ 'mult_proposal_score': cfg.MODEL.ROI_BOX_HEAD.MULT_PROPOSAL_SCORE,
106
+ 'with_image_labels': cfg.WITH_IMAGE_LABELS,
107
+ 'add_image_box': cfg.MODEL.ROI_BOX_HEAD.ADD_IMAGE_BOX,
108
+ 'image_box_size': cfg.MODEL.ROI_BOX_HEAD.IMAGE_BOX_SIZE,
109
+ 'ws_num_props': cfg.MODEL.ROI_BOX_HEAD.WS_NUM_PROPS,
110
+ 'add_feature_to_prop': cfg.MODEL.ROI_BOX_HEAD.ADD_FEATURE_TO_PROP,
111
+ 'mask_weight': cfg.MODEL.ROI_HEADS.MASK_WEIGHT,
112
+ 'one_class_per_proposal': cfg.MODEL.ROI_HEADS.ONE_CLASS_PER_PROPOSAL,
113
+ "use_regional_embedding": cfg.MODEL.ROI_BOX_HEAD.USE_REGIONAL_EMBEDDING,
114
+ "base_cat_mask": cfg.MODEL.ROI_HEADS.BASE_CAT_MASK,
115
+ "cmm_stage": cfg.MODEL.ROI_HEADS.CMM.MIXUP_STAGE,
116
+ "cmm_stage_test": cfg.MODEL.ROI_HEADS.CMM.MIXUP_STAGE_TEST,
117
+ "cmm_beta": cfg.MODEL.ROI_HEADS.CMM.MIXUP_BETA,
118
+ "cmm_loss": cfg.MODEL.ROI_HEADS.CMM.LOSS,
119
+ "cmm_loss_weight": cfg.MODEL.ROI_HEADS.CMM.LOSS_WEIGHT,
120
+ "cmm_separated_branch": cfg.MODEL.ROI_HEADS.CMM.SEPARATED_BRANCH,
121
+ "cmm_base_alpha": cfg.MODEL.ROI_HEADS.CMM.BASE_ALPHA,
122
+ "cmm_novel_beta": cfg.MODEL.ROI_HEADS.CMM.NOVEL_BETA,
123
+ "cmm_use_inl": cfg.MODEL.ROI_HEADS.CMM.USE_INL,
124
+ "cmm_prototype": cfg.MODEL.ROI_HEADS.CMM.PROTOTYPE,
125
+ "cmm_prototype_temp": cfg.MODEL.ROI_HEADS.CMM.PROTOTYPE_TEMP,
126
+ "cmm_classifier_temp": cfg.MODEL.ROI_HEADS.CMM.CLASSIFIER_TEMP,
127
+ "cmm_use_sigmoid_ce": cfg.MODEL.ROI_HEADS.CMM.USE_SIGMOID_CE,
128
+ })
129
+ return ret
130
+
131
+
132
+ @classmethod
133
+ def _init_box_head(self, cfg, input_shape):
134
+ ret = super()._init_box_head(cfg, input_shape)
135
+ del ret['box_predictors']
136
+ cascade_bbox_reg_weights = cfg.MODEL.ROI_BOX_CASCADE_HEAD.BBOX_REG_WEIGHTS
137
+ box_predictors = []
138
+ for box_head, bbox_reg_weights in zip(ret['box_heads'], \
139
+ cascade_bbox_reg_weights):
140
+ box_predictors.append(
141
+ ProxydetFastRCNNOutputLayers(
142
+ cfg, box_head.output_shape,
143
+ box2box_transform=Box2BoxTransform(weights=bbox_reg_weights)
144
+ ))
145
+ ret['box_predictors'] = box_predictors
146
+ return ret
147
+
148
+ def _embed_interp(self, re_i, re_j, te_i, te_j, lam):
149
+ # mix image, text embedding
150
+ _mixed_re = lam * re_i + (1 - lam) * re_j
151
+ _mixed_te = lam * te_i + (1 - lam) * te_j
152
+ return _mixed_re, _mixed_te
153
+
154
+ def _get_head_outputs(self, features, proposals, image_sizes, _run_stage, box_predictor, targets=None, ann_type='box', classifier_info=(None,None,None)):
155
+ head_outputs = [] # (predictor, predictions, proposals)
156
+ for k in range(self.num_cascade_stages):
157
+ if k > 0:
158
+ proposals = self._create_proposals_from_boxes(
159
+ prev_pred_boxes, image_sizes,
160
+ logits=[p.objectness_logits for p in proposals])
161
+ if self.training and ann_type in ['box']:
162
+ proposals = self._match_and_label_boxes(
163
+ proposals, k, targets)
164
+ predictions = _run_stage(features, proposals, k,
165
+ classifier_info=classifier_info)
166
+ prev_pred_boxes = box_predictor[k].predict_boxes(
167
+ (predictions[0], predictions[1]), proposals)
168
+ head_outputs.append((box_predictor[k], predictions, proposals))
169
+ return head_outputs, proposals
170
+
171
+ def loss_mixup(self, mixed_re, mixed_te, text_embeddings):
172
+ # loss
173
+ if self.cmm_loss in ["l1", "l2"]:
174
+ if self.cmm_loss == "l1":
175
+ loss_type = F.l1_loss
176
+ elif self.cmm_loss == "l2":
177
+ loss_type = F.mse_loss
178
+ cmm_loss = loss_type(mixed_re, mixed_te)
179
+ else:
180
+ raise ValueError("No such loss is supported : ", self.cmm_loss)
181
+ return cmm_loss
182
+
183
+ def mixup(self, stage, regional_embeddings, text_embeddings, gt_classes, proto_weights=None):
184
+ # class-wise multi-modal mixup
185
+ try:
186
+ neg_class = text_embeddings.shape[0] - 1
187
+
188
+ # select positive text embeddings
189
+ all_classes = torch.unique(gt_classes)
190
+ pos_classes = all_classes[all_classes != neg_class]
191
+
192
+ # select class-wise regional embeddings & text embeddings
193
+ clswise_re = []
194
+ clswise_te = []
195
+ for p_c in pos_classes:
196
+ mask = (gt_classes == p_c)
197
+ if self.cmm_prototype == "center":
198
+ _clswise_re = torch.mean(regional_embeddings[mask], axis=0, keepdim=True)
199
+ elif self.cmm_prototype in ["obj_score", "iou"]:
200
+ soft_proto_weights = F.softmax(proto_weights[mask] / self.cmm_prototype_temp, dim=0)
201
+ _clswise_re = torch.sum(regional_embeddings[mask] * soft_proto_weights.unsqueeze(-1), 0, keepdim=True)
202
+ _clswise_te = text_embeddings[int(p_c.item())].unsqueeze(0)
203
+ clswise_re.append(_clswise_re)
204
+ clswise_te.append(_clswise_te)
205
+
206
+ if len(clswise_re) == 0:
207
+ raise ValueError("no positive base classes found for mixup.")
208
+
209
+ clswise_re = torch.cat(clswise_re, dim=0)
210
+ clswise_re = F.normalize(clswise_re, p=2, dim=1) # re-normalize
211
+ clswise_te = torch.cat(clswise_te, dim=0)
212
+
213
+ if self.cmm_beta == 0:
214
+ lam = float(np.random.randint(2))
215
+ else:
216
+ lam = np.random.beta(self.cmm_beta, self.cmm_beta)
217
+
218
+ # random shuffle for mixup pair
219
+ rand_index = torch.randperm(clswise_re.size()[0]).to(clswise_re.device)
220
+
221
+ # mixup
222
+ sf_clswise_re = clswise_re[rand_index]
223
+ sf_clswise_te = clswise_te[rand_index]
224
+ mixed_re, mixed_te = self._embed_interp(clswise_re, sf_clswise_re, clswise_te, sf_clswise_te, lam)
225
+ mixed_re = F.normalize(mixed_re, p=2, dim=1)
226
+ mixed_te = F.normalize(mixed_te, p=2, dim=1)
227
+ cmm_loss = self.loss_mixup(mixed_re, mixed_te, text_embeddings)
228
+
229
+ except Exception as e:
230
+ print(f"Caught error in mixup: {e!r}; skipping mixup for this batch.")
231
+ cmm_loss = text_embeddings[0].new_zeros([1])[0]
232
+
233
+ return cmm_loss
234
+
235
+ def _forward_box(self, features, proposals, targets=None,
236
+ ann_type='box', classifier_info=(None,None,None)):
237
+ """
238
+ Add mult proposal scores at testing
239
+ Add ann_type
240
+ """
241
+ if (not self.training) and self.mult_proposal_score:
242
+ if len(proposals) > 0 and proposals[0].has('scores'):
243
+ proposal_scores = [p.get('scores') for p in proposals]
244
+ else:
245
+ proposal_scores = [p.get('objectness_logits') for p in proposals]
246
+
247
+ features = [features[f] for f in self.box_in_features]
248
+ # head_outputs = [] # (predictor, predictions, proposals)
249
+ prev_pred_boxes = None
250
+ image_sizes = [x.image_size for x in proposals]
251
+
252
+ head_outputs, proposals = self._get_head_outputs(features, proposals, image_sizes, self._run_stage, self.box_predictor, targets, ann_type, classifier_info)
253
+ if self.cmm_separated_branch:
254
+ # separated forward
255
+ head_outputs_cmm, proposals_cmm = self._get_head_outputs(features, proposals, image_sizes, self._run_stage_cmm, self.box_predictor_cmm, targets, ann_type, classifier_info)
256
+
257
+ if self.training:
258
+ losses = {}
259
+ storage = get_event_storage()
260
+ for stage, (predictor, predictions, proposals) in enumerate(head_outputs):
261
+ with storage.name_scope("stage{}".format(stage)):
262
+ if ann_type != 'box':
263
+ stage_losses = {}
264
+ if ann_type in ['image', 'caption', 'captiontag']:
265
+ image_labels = [x._pos_category_ids for x in targets]
266
+ weak_losses = predictor.image_label_losses(
267
+ predictions, proposals, image_labels,
268
+ classifier_info=classifier_info,
269
+ ann_type=ann_type)
270
+ stage_losses.update(weak_losses)
271
+
272
+ if self.cmm_use_inl and len(self.cmm_stage) > 0 and stage in self.cmm_stage:
273
+ if self.cmm_separated_branch:
274
+ # get regional embeddings (l2 normalized) from separated branch
275
+ regional_embeddings = head_outputs_cmm[stage][1][2]
276
+ else:
277
+ # get regional embeddings (l2 normalized)
278
+ regional_embeddings = predictions[2]
279
+
280
+ # get text embeddings (L2 normalized), [C (1203 + 1), embedding dim]
281
+ text_embeddings = predictor.cls_score.zs_weight.t()
282
+
283
+ # get max-size proposal's regional embedding, per image
284
+ num_inst_per_image = [len(p) for p in proposals_cmm]
285
+ re_per_image = regional_embeddings.split(num_inst_per_image, dim=0)
286
+
287
+ maxsize_re_per_image = []
288
+ for p, re in zip(proposals_cmm, re_per_image):
289
+ sizes = p.proposal_boxes.area()
290
+ ind = sizes[:-1].argmax().item() if len(sizes) > 1 else 0
291
+ maxsize_re_per_image.append(re[ind].unsqueeze(0))
292
+ maxsize_re_per_image = torch.cat(maxsize_re_per_image, dim=0)
293
+ maxsize_re_per_image = maxsize_re_per_image.to(regional_embeddings.device)
294
+
295
+ # get gt classes per max-size proposal
296
+ # TODO: add best-label per image by cls loss from weak_losses (image-label loss)
297
+ # NOTE: image_labels are not multi-labels.
298
+ gt_classes = (
299
+ regional_embeddings.new_tensor(
300
+ [np.random.choice(labels, 1, replace=False)[0] for labels in image_labels],
301
+ dtype=torch.long
302
+ ) if len(proposals_cmm)
303
+ else torch.empty(0)
304
+ )
305
+
306
+ proto_weights = None
307
+
308
+ # get text embeddings (L2 normalized), [C (1203 + 1), embedding dim]
309
+ text_embeddings = predictor.cls_score.zs_weight.t()
310
+ cmm_loss = self.mixup(stage, maxsize_re_per_image, text_embeddings, gt_classes, proto_weights)
311
+ stage_losses["cmm_image_loss"] = (
312
+ cmm_loss * self.cmm_loss_weight
313
+ )
314
+ stage_losses["cmm_loss"] = \
315
+ predictions[0].new_zeros([1])[0]
316
+
317
+ else: # supervised
318
+ stage_losses = predictor.losses(
319
+ (predictions[0], predictions[1]), proposals,
320
+ classifier_info=classifier_info)
321
+ if self.with_image_labels:
322
+ stage_losses['image_loss'] = \
323
+ predictions[0].new_zeros([1])[0]
324
+
325
+ if len(self.cmm_stage) > 0 and stage in self.cmm_stage:
326
+ assert self.use_regional_embedding
327
+
328
+ # get gt classes per proposal
329
+ # e.g. dtype: torch.int64, value: tensor([ 142, 111, 142, ..., 1203, 1203, 1203], device='cuda:6')
330
+ gt_classes = (
331
+ cat([p.gt_classes for p in proposals_cmm], dim=0)
332
+ if len(proposals_cmm)
333
+ else torch.empty(0)
334
+ )
335
+
336
+ if self.cmm_prototype in ["obj_score"]:
337
+ proto_weights = (
338
+ cat([p.objectness_logits for p in proposals_cmm], dim=0)
339
+ if len(proposals_cmm)
340
+ else torch.empty(0)
341
+ )
342
+ elif self.cmm_prototype in ["iou"]:
343
+ gt_boxes = (
344
+ cat([p.gt_boxes.tensor for p in proposals_cmm], dim=0)
345
+ if len(proposals_cmm)
346
+ else torch.empty(0)
347
+ )
348
+ proposal_boxes = (
349
+ cat([p.proposal_boxes.tensor for p in proposals_cmm], dim=0)
350
+ if len(proposals_cmm)
351
+ else torch.empty(0)
352
+ )
353
+
354
+ proto_weights = 1 - giou_loss(proposal_boxes, gt_boxes, reduction="none") # GIoU. (-1 < x < 1)
355
+ else:
356
+ proto_weights = None
357
+
358
+ if self.cmm_separated_branch:
359
+ # get regional embeddings (l2 normalized) from separated branch
360
+ regional_embeddings = head_outputs_cmm[stage][1][2]
361
+ else:
362
+ # get regional embeddings (l2 normalized)
363
+ regional_embeddings = predictions[2]
364
+
365
+ # get text embeddings (L2 normalized), [C (1203 + 1), embedding dim]
366
+ text_embeddings = predictor.cls_score.zs_weight.t()
367
+
368
+ cmm_loss = self.mixup(stage, regional_embeddings, text_embeddings, gt_classes, proto_weights)
369
+ stage_losses["cmm_loss"] = (
370
+ cmm_loss * self.cmm_loss_weight
371
+ )
372
+
373
+ if self.cmm_use_inl:
374
+ stage_losses["cmm_image_loss"] = \
375
+ predictions[0].new_zeros([1])[0]
376
+
377
+ losses.update({k + "_stage{}".format(stage): v \
378
+ for k, v in stage_losses.items()})
379
+ return losses
380
+ else:
381
+ # Each is a list[Tensor] of length #image. Each tensor is Ri x (K+1)
382
+ scores_per_stage = [h[0].predict_probs(h[1], h[2]) for h in head_outputs]
383
+ scores = [
384
+ sum(list(scores_per_image)) * (1.0 / self.num_cascade_stages)
385
+ for scores_per_image in zip(*scores_per_stage)
386
+ ]
387
+
388
+ if self.cmm_separated_branch:
389
+ # scores from separated branch
390
+ if self.cmm_stage_test is None:
391
+ # average all stage's classification scores
392
+ scores_per_stage_cmm = [h[0].predict_probs(h[1], h[2]) for h in head_outputs_cmm]
393
+ scores_cmm = [
394
+ sum(list(scores_per_image)) * (1.0 / self.num_cascade_stages)
395
+ for scores_per_image in zip(*scores_per_stage_cmm)
396
+ ]
397
+ else:
398
+ # only using specific stages
399
+ scores_per_stage_cmm = [h[0].predict_probs(h[1], h[2]) for k, h in enumerate(head_outputs_cmm) if k in self.cmm_stage_test]
400
+ scores_cmm = [
401
+ sum(list(scores_per_image)) * (1.0 / len(self.cmm_stage_test))
402
+ for scores_per_image in zip(*scores_per_stage_cmm)
403
+ ]
404
+
405
+ base_cat_mask = self.base_cat_mask
406
+ assert len(scores) == 1
407
+ bg_score = scores[0][:, -1].clone()
408
+ scores[0][:, base_cat_mask] = scores[0][:, base_cat_mask].pow(
409
+ 1.0 - self.cmm_base_alpha
410
+ ) * scores_cmm[0][:, base_cat_mask].pow(self.cmm_base_alpha)
411
+ scores[0][:, ~base_cat_mask] = scores[0][:, ~base_cat_mask].pow(
412
+ 1.0 - self.cmm_novel_beta
413
+ ) * scores_cmm[0][:, ~base_cat_mask].pow(self.cmm_novel_beta)
414
+ scores[0][:, -1] = bg_score
415
+
416
+ if self.mult_proposal_score:
417
+ scores = [(s * ps[:, None]) ** 0.5 \
418
+ for s, ps in zip(scores, proposal_scores)]
419
+ if self.one_class_per_proposal:
420
+ scores = [s * (s == s[:, :-1].max(dim=1)[0][:, None]).float() for s in scores]
421
+ predictor, predictions, proposals = head_outputs[-1]
422
+ boxes = predictor.predict_boxes(
423
+ (predictions[0], predictions[1]), proposals)
424
+ pred_instances, _ = fast_rcnn_inference(
425
+ boxes,
426
+ scores,
427
+ image_sizes,
428
+ predictor.test_score_thresh,
429
+ predictor.test_nms_thresh,
430
+ predictor.test_topk_per_image,
431
+ )
432
+ return pred_instances
433
+
434
+
435
+ def forward(self, images, features, proposals, targets=None,
436
+ ann_type='box', classifier_info=(None,None,None)):
437
+ '''
438
+ enable debug and image labels
439
+ classifier_info is shared across the batch
440
+ '''
441
+ if self.training:
442
+ if ann_type in ['box', 'prop', 'proptag']:
443
+ proposals = self.label_and_sample_proposals(
444
+ proposals, targets)
445
+ else:
446
+ proposals = self.get_top_proposals(proposals)
447
+
448
+ losses = self._forward_box(features, proposals, targets, \
449
+ ann_type=ann_type, classifier_info=classifier_info)
450
+ if ann_type == 'box' and targets[0].has('gt_masks'):
451
+ mask_losses = self._forward_mask(features, proposals)
452
+ losses.update({k: v * self.mask_weight \
453
+ for k, v in mask_losses.items()})
454
+ losses.update(self._forward_keypoint(features, proposals))
455
+ else:
456
+ losses.update(self._get_empty_mask_loss(
457
+ features, proposals,
458
+ device=proposals[0].objectness_logits.device))
459
+ return proposals, losses
460
+ else:
461
+ pred_instances = self._forward_box(
462
+ features, proposals, classifier_info=classifier_info)
463
+ pred_instances = self.forward_with_given_boxes(features, pred_instances)
464
+ return pred_instances, {}
465
+
466
+
467
+ def get_top_proposals(self, proposals):
468
+ for i in range(len(proposals)):
469
+ proposals[i].proposal_boxes.clip(proposals[i].image_size)
470
+ proposals = [p[:self.ws_num_props] for p in proposals]
471
+ for i, p in enumerate(proposals):
472
+ p.proposal_boxes.tensor = p.proposal_boxes.tensor.detach()
473
+ if self.add_image_box:
474
+ proposals[i] = self._add_image_box(p)
475
+ return proposals
476
+
477
+
478
+ def _add_image_box(self, p):
479
+ image_box = Instances(p.image_size)
480
+ n = 1
481
+ h, w = p.image_size
482
+ f = self.image_box_size
483
+ image_box.proposal_boxes = Boxes(
484
+ p.proposal_boxes.tensor.new_tensor(
485
+ [w * (1. - f) / 2.,
486
+ h * (1. - f) / 2.,
487
+ w * (1. - (1. - f) / 2.),
488
+ h * (1. - (1. - f) / 2.)]
489
+ ).view(n, 4))
490
+ image_box.objectness_logits = p.objectness_logits.new_ones(n)
491
+ return Instances.cat([p, image_box])
492
+
493
+
494
+ def _get_empty_mask_loss(self, features, proposals, device):
495
+ if self.mask_on:
496
+ return {'loss_mask': torch.zeros(
497
+ (1, ), device=device, dtype=torch.float32)[0]}
498
+ else:
499
+ return {}
500
+
501
+
502
+ def _create_proposals_from_boxes(self, boxes, image_sizes, logits):
503
+ """
504
+ Add objectness_logits
505
+ """
506
+ boxes = [Boxes(b.detach()) for b in boxes]
507
+ proposals = []
508
+ for boxes_per_image, image_size, logit in zip(
509
+ boxes, image_sizes, logits):
510
+ boxes_per_image.clip(image_size)
511
+ if self.training:
512
+ inds = boxes_per_image.nonempty()
513
+ boxes_per_image = boxes_per_image[inds]
514
+ logit = logit[inds]
515
+ prop = Instances(image_size)
516
+ prop.proposal_boxes = boxes_per_image
517
+ prop.objectness_logits = logit
518
+ proposals.append(prop)
519
+ return proposals
520
+
521
+
522
+ def _run_stage(self, features, proposals, stage, \
523
+ classifier_info=(None,None,None)):
524
+ """
525
+ Support classifier_info and add_feature_to_prop
526
+ """
527
+ pool_boxes = [x.proposal_boxes for x in proposals]
528
+ box_features = self.box_pooler(features, pool_boxes)
529
+ box_features = _ScaleGradient.apply(box_features, 1.0 / self.num_cascade_stages)
530
+ box_features = self.box_head[stage](box_features)
531
+ if self.add_feature_to_prop:
532
+ feats_per_image = box_features.split(
533
+ [len(p) for p in proposals], dim=0)
534
+ for feat, p in zip(feats_per_image, proposals):
535
+ p.feat = feat
536
+ return self.box_predictor[stage](
537
+ box_features,
538
+ classifier_info=classifier_info)
539
+
540
+ def _run_stage_cmm(self, features, proposals, stage, \
541
+ classifier_info=(None,None,None)):
542
+ """
543
+ Support classifier_info and add_feature_to_prop
544
+ """
545
+ pool_boxes = [x.proposal_boxes for x in proposals]
546
+ box_features = self.box_pooler(features, pool_boxes)
547
+ box_features = _ScaleGradient.apply(box_features, 1.0 / self.num_cascade_stages)
548
+ box_features = self.box_head_cmm[stage](box_features)
549
+ if self.add_feature_to_prop:
550
+ feats_per_image = box_features.split(
551
+ [len(p) for p in proposals], dim=0)
552
+ for feat, p in zip(feats_per_image, proposals):
553
+ p.feat = feat
554
+ return self.box_predictor_cmm[stage](
555
+ box_features,
556
+ classifier_info=classifier_info)
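The core of the ROI heads above is the class-wise multi-modal mixup in `mixup`: per-class prototypes of the L2-normalized regional embeddings are interpolated with the matching CLIP text embeddings using a shared Beta-distributed coefficient and a shared random pairing, and an L1 loss pulls the mixed regional prototype toward the mixed text prototype. Below is a standalone sketch of that loss under the default "center" prototype and L1 settings; the function name `class_wise_mixup_loss` and the toy inputs are illustrative, not the exact training path.

import numpy as np
import torch
import torch.nn.functional as F

def class_wise_mixup_loss(region_embs, text_embs, gt_classes, beta=1.0):
    C = text_embs.shape[0] - 1                   # last row is the background class
    pos = torch.unique(gt_classes)
    pos = pos[pos != C]                          # keep only foreground classes
    if len(pos) == 0:
        return text_embs.new_zeros(())
    # "center" prototype: mean regional embedding per positive class, re-normalized.
    proto_re = torch.stack([region_embs[gt_classes == c].mean(0) for c in pos])
    proto_re = F.normalize(proto_re, p=2, dim=1)
    proto_te = text_embs[pos]
    lam = float(np.random.beta(beta, beta))
    perm = torch.randperm(len(pos))
    # Mix image-side and text-side prototypes with the same lambda and pairing,
    # then align the mixed regional prototype with the mixed text prototype.
    mixed_re = F.normalize(lam * proto_re + (1 - lam) * proto_re[perm], p=2, dim=1)
    mixed_te = F.normalize(lam * proto_te + (1 - lam) * proto_te[perm], p=2, dim=1)
    return F.l1_loss(mixed_re, mixed_te)

# Toy usage with random embeddings.
D, C = 512, 10
region_embs = F.normalize(torch.randn(32, D), dim=1)
text_embs = F.normalize(torch.randn(C + 1, D), dim=1)
gt_classes = torch.randint(0, C + 1, (32,))
loss = class_wise_mixup_loss(region_embs, text_embs, gt_classes)

At test time, `_forward_box` above fuses the scores of this CMM branch with the standard branch through a per-category geometric mean, using `cmm_base_alpha` as the exponent for base categories and `cmm_novel_beta` for novel ones, while leaving the background score untouched.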
proxydet/modeling/roi_heads/zero_shot_classifier.py ADDED
@@ -0,0 +1,111 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ '''
3
+ Modifications Copyright (c) 2024-present NAVER Corp, Apache License v2.0
4
+ original source: https://github.com/facebookresearch/Detic/blob/main/detic/modeling/roi_heads/zero_shot_classifier.py
5
+ '''
6
+ import numpy as np
7
+ import torch
8
+ from torch import nn
9
+ from torch.nn import functional as F
10
+ from detectron2.config import configurable
11
+ from detectron2.layers import Linear, ShapeSpec
12
+
13
+ class ZeroShotClassifier(nn.Module):
14
+ @configurable
15
+ def __init__(
16
+ self,
17
+ input_shape: ShapeSpec,
18
+ *,
19
+ num_classes: int,
20
+ zs_weight_path: str,
21
+ zs_weight_dim: int = 512,
22
+ use_bias: float = 0.0,
23
+ norm_weight: bool = True,
24
+ norm_temperature: float = 50.0,
25
+ use_regional_embedding: bool = False
26
+ ):
27
+ super().__init__()
28
+ if isinstance(input_shape, int): # some backward compatibility
29
+ input_shape = ShapeSpec(channels=input_shape)
30
+ input_size = input_shape.channels * (input_shape.width or 1) * (input_shape.height or 1)
31
+ self.norm_weight = norm_weight
32
+ self.norm_temperature = norm_temperature
33
+
34
+ self.use_bias = use_bias < 0
35
+ if self.use_bias:
36
+ self.cls_bias = nn.Parameter(torch.ones(1) * use_bias)
37
+
38
+ self.linear = nn.Linear(input_size, zs_weight_dim)
39
+
40
+ if zs_weight_path == 'rand':
41
+ zs_weight = torch.randn((zs_weight_dim, num_classes))
42
+ nn.init.normal_(zs_weight, std=0.01)
43
+ else:
44
+ zs_weight = torch.tensor(
45
+ np.load(zs_weight_path),
46
+ dtype=torch.float32).permute(1, 0).contiguous() # D x C
47
+ zs_weight = torch.cat(
48
+ [zs_weight, zs_weight.new_zeros((zs_weight_dim, 1))],
49
+ dim=1) # D x (C + 1)
50
+
51
+ if self.norm_weight:
52
+ zs_weight = F.normalize(zs_weight, p=2, dim=0)
53
+
54
+ if zs_weight_path == 'rand':
55
+ self.zs_weight = nn.Parameter(zs_weight)
56
+ else:
57
+ self.register_buffer('zs_weight', zs_weight)
58
+
59
+ assert self.zs_weight.shape[1] == num_classes + 1, self.zs_weight.shape
60
+
61
+ self.use_regional_embedding = use_regional_embedding
62
+ if self.use_regional_embedding:
63
+ assert (
64
+ self.norm_weight
65
+ ), "norm_weight should be True for using regional embedding."
66
+
67
+
68
+ @classmethod
69
+ def from_config(cls, cfg, input_shape):
70
+ return {
71
+ 'input_shape': input_shape,
72
+ 'num_classes': cfg.MODEL.ROI_HEADS.NUM_CLASSES,
73
+ 'zs_weight_path': cfg.MODEL.ROI_BOX_HEAD.ZEROSHOT_WEIGHT_PATH,
74
+ 'zs_weight_dim': cfg.MODEL.ROI_BOX_HEAD.ZEROSHOT_WEIGHT_DIM,
75
+ 'use_bias': cfg.MODEL.ROI_BOX_HEAD.USE_BIAS,
76
+ 'norm_weight': cfg.MODEL.ROI_BOX_HEAD.NORM_WEIGHT,
77
+ 'norm_temperature': cfg.MODEL.ROI_BOX_HEAD.NORM_TEMP,
78
+ "use_regional_embedding": cfg.MODEL.ROI_BOX_HEAD.USE_REGIONAL_EMBEDDING,
79
+ }
80
+
81
+ def forward(self, x, classifier=None):
82
+ '''
83
+ Inputs:
84
+ x: B x D'
85
+ classifier: C' x D (optional external classifier weights)
86
+ '''
87
+ x = self.linear(x)
88
+ if classifier is not None:
89
+ zs_weight = classifier.permute(1, 0).contiguous() # D x C'
90
+ zs_weight = F.normalize(zs_weight, p=2, dim=0) \
91
+ if self.norm_weight else zs_weight
92
+ else:
93
+ zs_weight = self.zs_weight
94
+ if self.norm_weight:
95
+ # NOTE: x shape is [Batch size * # proposals for each image, 512 (embedding dim)]
96
+ x = F.normalize(x, p=2, dim=1)
97
+ if self.use_regional_embedding:
98
+ # NOTE: gradient of cloned tensor will be propagated to the original tensor (x): https://discuss.pytorch.org/t/how-does-clone-interact-with-backpropagation/8247/6?u=111368
99
+ regional_embedding = x.clone()
100
+ # NOTE: apply normalizing temperature
101
+ x *= self.norm_temperature
102
+
103
+ x = torch.mm(x, zs_weight)
104
+
105
+ if self.use_bias:
106
+ x = x + self.cls_bias
107
+
108
+ if not self.use_regional_embedding:
109
+ return x # class logits
110
+ else:
111
+ return x, regional_embedding # class logits & regional embeddings
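At its core, `ZeroShotClassifier.forward` computes a temperature-scaled cosine similarity between the projected box feature and a fixed bank of CLIP class embeddings, with an all-zero background column appended. A toy sketch of that computation follows; the dimensions are illustrative, and the temperature 50.0 matches the class default `norm_temperature`.

import torch
import torch.nn.functional as F

B, D_in, D_emb, C = 4, 1024, 512, 6
box_feats = torch.randn(B, D_in)                          # pooled per-box features
linear = torch.nn.Linear(D_in, D_emb)                     # corresponds to self.linear

# D x (C + 1) classifier: class embeddings plus an all-zero background column,
# L2-normalized per column (the zero column stays zero thanks to the eps in normalize).
zs_weight = torch.cat([torch.randn(D_emb, C), torch.zeros(D_emb, 1)], dim=1)
zs_weight = F.normalize(zs_weight, p=2, dim=0)

region_emb = F.normalize(linear(box_feats), p=2, dim=1)   # region embedding on the unit sphere
logits = 50.0 * region_emb @ zs_weight                    # temperature-scaled cosine similarity
print(logits.shape)                                       # torch.Size([4, 7])

When `use_regional_embedding` is enabled, the normalized region embedding is returned alongside the logits so the ROI heads can feed it into the multi-modal mixup loss.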
proxydet/modeling/text/text_encoder.py ADDED
@@ -0,0 +1,189 @@
1
+ # This code is modified from https://github.com/openai/CLIP/blob/main/clip/clip.py
2
+ # Modified by Xingyi Zhou
3
+ # The original code is under MIT license
4
+ # Copyright (c) Facebook, Inc. and its affiliates.
5
+ from typing import Union, List
6
+ from collections import OrderedDict
7
+ import torch
8
+ from torch import nn
9
+ import torch
10
+
11
+ from clip.simple_tokenizer import SimpleTokenizer as _Tokenizer
12
+
13
+ __all__ = ["tokenize"]
14
+
15
+ count = 0
16
+
17
+ class LayerNorm(nn.LayerNorm):
18
+ """Subclass torch's LayerNorm to handle fp16."""
19
+
20
+ def forward(self, x: torch.Tensor):
21
+ orig_type = x.dtype
22
+ ret = super().forward(x.type(torch.float32))
23
+ return ret.type(orig_type)
24
+
25
+
26
+ class QuickGELU(nn.Module):
27
+ def forward(self, x: torch.Tensor):
28
+ return x * torch.sigmoid(1.702 * x)
29
+
30
+
31
+ class ResidualAttentionBlock(nn.Module):
32
+ def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None):
33
+ super().__init__()
34
+
35
+ self.attn = nn.MultiheadAttention(d_model, n_head)
36
+ self.ln_1 = LayerNorm(d_model)
37
+ self.mlp = nn.Sequential(OrderedDict([
38
+ ("c_fc", nn.Linear(d_model, d_model * 4)),
39
+ ("gelu", QuickGELU()),
40
+ ("c_proj", nn.Linear(d_model * 4, d_model))
41
+ ]))
42
+ self.ln_2 = LayerNorm(d_model)
43
+ self.attn_mask = attn_mask
44
+
45
+ def attention(self, x: torch.Tensor):
46
+ self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None
47
+ return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]
48
+
49
+ def forward(self, x: torch.Tensor):
50
+ x = x + self.attention(self.ln_1(x))
51
+ x = x + self.mlp(self.ln_2(x))
52
+ return x
53
+
54
+
55
+ class Transformer(nn.Module):
56
+ def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None):
57
+ super().__init__()
58
+ self.width = width
59
+ self.layers = layers
60
+ self.resblocks = nn.Sequential(
61
+ *[ResidualAttentionBlock(width, heads, attn_mask) \
62
+ for _ in range(layers)])
63
+
64
+ def forward(self, x: torch.Tensor):
65
+ return self.resblocks(x)
66
+
67
+ class CLIPTEXT(nn.Module):
68
+ def __init__(self,
69
+ embed_dim=512,
70
+ # text
71
+ context_length=77,
72
+ vocab_size=49408,
73
+ transformer_width=512,
74
+ transformer_heads=8,
75
+ transformer_layers=12
76
+ ):
77
+ super().__init__()
78
+
79
+ self._tokenizer = _Tokenizer()
80
+ self.context_length = context_length
81
+
82
+ self.transformer = Transformer(
83
+ width=transformer_width,
84
+ layers=transformer_layers,
85
+ heads=transformer_heads,
86
+ attn_mask=self.build_attention_mask()
87
+ )
88
+
89
+ self.vocab_size = vocab_size
90
+ self.token_embedding = nn.Embedding(vocab_size, transformer_width)
91
+ self.positional_embedding = nn.Parameter(torch.empty(self.context_length, transformer_width))
92
+ self.ln_final = LayerNorm(transformer_width)
93
+
94
+ self.text_projection = nn.Parameter(torch.empty(transformer_width, embed_dim))
95
+ # self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
96
+
97
+ self.initialize_parameters()
98
+
99
+ def initialize_parameters(self):
100
+ nn.init.normal_(self.token_embedding.weight, std=0.02)
101
+ nn.init.normal_(self.positional_embedding, std=0.01)
102
+
103
+ proj_std = (self.transformer.width ** -0.5) * ((2 * self.transformer.layers) ** -0.5)
104
+ attn_std = self.transformer.width ** -0.5
105
+ fc_std = (2 * self.transformer.width) ** -0.5
106
+ for block in self.transformer.resblocks:
107
+ nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
108
+ nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
109
+ nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
110
+ nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)
111
+
112
+ if self.text_projection is not None:
113
+ nn.init.normal_(self.text_projection, std=self.transformer.width ** -0.5)
114
+
115
+ def build_attention_mask(self):
116
+ # lazily create causal attention mask, with full attention between the vision tokens
117
+ # pytorch uses additive attention mask; fill with -inf
118
+ mask = torch.empty(self.context_length, self.context_length)
119
+ mask.fill_(float("-inf"))
120
+ mask.triu_(1) # zero out the lower diagonal
121
+ return mask
122
+
123
+ @property
124
+ def device(self):
125
+ return self.text_projection.device
126
+
127
+ @property
128
+ def dtype(self):
129
+ return self.text_projection.dtype
130
+
131
+ def tokenize(self,
132
+ texts: Union[str, List[str]], \
133
+ context_length: int = 77) -> torch.LongTensor:
134
+ """
135
+ Tokenize a string or a list of strings into CLIP token-id tensors,
+ randomly cropping sequences longer than `context_length`.
+ """
136
+ if isinstance(texts, str):
137
+ texts = [texts]
138
+
139
+ sot_token = self._tokenizer.encoder["<|startoftext|>"]
140
+ eot_token = self._tokenizer.encoder["<|endoftext|>"]
141
+ all_tokens = [[sot_token] + self._tokenizer.encode(text) + [eot_token] for text in texts]
142
+ result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)
143
+
144
+ for i, tokens in enumerate(all_tokens):
145
+ if len(tokens) > context_length:
146
+ st = torch.randint(
147
+ len(tokens) - context_length + 1, (1,))[0].item()
148
+ tokens = tokens[st: st + context_length]
149
+ # raise RuntimeError(f"Input {texts[i]} is too long for context length {context_length}")
150
+ result[i, :len(tokens)] = torch.tensor(tokens)
151
+
152
+ return result
153
+
154
+ def encode_text(self, text):
155
+ x = self.token_embedding(text).type(self.dtype) # [batch_size, n_ctx, d_model]
156
+ x = x + self.positional_embedding.type(self.dtype)
157
+ x = x.permute(1, 0, 2) # NLD -> LND
158
+ x = self.transformer(x)
159
+ x = x.permute(1, 0, 2) # LND -> NLD
160
+ x = self.ln_final(x).type(self.dtype)
161
+ # take features from the eot embedding (eot_token is the highest number in each sequence)
162
+ x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection
163
+ return x
164
+
165
+ def forward(self, captions):
166
+ '''
167
+ captions: list of strings
168
+ '''
169
+ text = self.tokenize(captions).to(self.device) # B x L x D
170
+ features = self.encode_text(text) # B x D
171
+ return features
172
+
173
+
174
+ def build_text_encoder(pretrain=True):
175
+ text_encoder = CLIPTEXT()
176
+ if pretrain:
177
+ import clip
178
+ pretrained_model, _ = clip.load("ViT-B/32", device='cpu')
179
+ state_dict = pretrained_model.state_dict()
180
+ to_delete_keys = ["logit_scale", "input_resolution", \
181
+ "context_length", "vocab_size"] + \
182
+ [k for k in state_dict.keys() if k.startswith('visual.')]
183
+ for k in to_delete_keys:
184
+ if k in state_dict:
185
+ del state_dict[k]
186
+ print('Loading pretrained CLIP')
187
+ text_encoder.load_state_dict(state_dict)
188
+ # import pdb; pdb.set_trace()
189
+ return text_encoder
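`build_text_encoder` is what lets the demo embed an arbitrary vocabulary into the classifier space. A usage sketch, assuming the `clip` package is installed and following the `'a '` prompt used in `proxydet/predictor.py`; the helper name `build_custom_classifier` is illustrative and not part of the repository.

import torch
# from proxydet.modeling.text.text_encoder import build_text_encoder  # defined above

def build_custom_classifier(vocabulary, text_encoder, prompt="a "):
    # Encode "a <class name>" for each entry and return a D x C weight matrix,
    # the layout expected by reset_cls_test / ZeroShotClassifier.
    with torch.no_grad():
        emb = text_encoder([prompt + name for name in vocabulary])   # C x D text features
    return emb.permute(1, 0).contiguous()

# text_encoder = build_text_encoder(pretrain=True).eval()
# classifier = build_custom_classifier(["pikachu", "beach umbrella"], text_encoder)

Column-wise L2 normalization is not needed here, since `reset_cls_test` re-normalizes the weights when `norm_weight` is enabled.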
proxydet/modeling/utils.py ADDED
@@ -0,0 +1,54 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ import torch
3
+ import json
4
+ import numpy as np
5
+ from torch.nn import functional as F
6
+
7
+ def load_class_freq(
8
+ path='datasets/metadata/lvis_v1_train_cat_info.json', freq_weight=1.0):
9
+ cat_info = json.load(open(path, 'r'))
10
+ cat_info = torch.tensor(
11
+ [c['image_count'] for c in sorted(cat_info, key=lambda x: x['id'])])
12
+ freq_weight = cat_info.float() ** freq_weight
13
+ return freq_weight
14
+
15
+
16
+ def get_fed_loss_inds(gt_classes, num_sample_cats, C, weight=None):
17
+ appeared = torch.unique(gt_classes) # C'
18
+ prob = appeared.new_ones(C + 1).float()
19
+ prob[-1] = 0
20
+ if len(appeared) < num_sample_cats:
21
+ if weight is not None:
22
+ prob[:C] = weight.float().clone()
23
+ prob[appeared] = 0
24
+ more_appeared = torch.multinomial(
25
+ prob, num_sample_cats - len(appeared),
26
+ replacement=False)
27
+ appeared = torch.cat([appeared, more_appeared])
28
+ return appeared
29
+
30
+
31
+
32
+ def reset_cls_test(model, cls_path, num_classes):
33
+ model.roi_heads.num_classes = num_classes
34
+ if type(cls_path) == str:
35
+ print('Resetting zs_weight', cls_path)
36
+ zs_weight = torch.tensor(
37
+ np.load(cls_path),
38
+ dtype=torch.float32).permute(1, 0).contiguous() # D x C
39
+ else:
40
+ zs_weight = cls_path
41
+ zs_weight = torch.cat(
42
+ [zs_weight, zs_weight.new_zeros((zs_weight.shape[0], 1))],
43
+ dim=1) # D x (C + 1)
44
+ if model.roi_heads.box_predictor[0].cls_score.norm_weight:
45
+ zs_weight = F.normalize(zs_weight, p=2, dim=0)
46
+ zs_weight = zs_weight.to(model.device)
47
+ for k in range(len(model.roi_heads.box_predictor)):
48
+ del model.roi_heads.box_predictor[k].cls_score.zs_weight
49
+ model.roi_heads.box_predictor[k].cls_score.zs_weight = zs_weight
50
+
51
+ if hasattr(model.roi_heads, "box_predictor_cmm"):
52
+ for k in range(len(model.roi_heads.box_predictor_cmm)):
53
+ del model.roi_heads.box_predictor_cmm[k].cls_score.zs_weight
54
+ model.roi_heads.box_predictor_cmm[k].cls_score.zs_weight = zs_weight
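Before installing a new classifier, `reset_cls_test` appends a zero background column, L2-normalizes each column, and writes the result into `cls_score.zs_weight` of every cascade stage (and of the separated CMM branch when present). A toy sketch of that weight preparation, with illustrative sizes:

import torch
import torch.nn.functional as F

D, C = 512, 3
zs_weight = torch.randn(D, C)                                          # e.g. CLIP embeddings for 3 custom classes
zs_weight = torch.cat([zs_weight, zs_weight.new_zeros(D, 1)], dim=1)   # D x (C + 1); last column = background
zs_weight = F.normalize(zs_weight, p=2, dim=0)                         # applied when norm_weight is True
print(zs_weight.shape)                                                 # torch.Size([512, 4])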
proxydet/predictor.py ADDED
@@ -0,0 +1,295 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ '''
3
+ Modifications Copyright (c) 2024-present NAVER Corp, Apache License v2.0
4
+ original source: https://github.com/facebookresearch/Detic/blob/main/detic/predictor.py
5
+ '''
6
+ import atexit
7
+ import bisect
8
+ import numpy as np
9
+ import multiprocessing as mp
10
+ from collections import deque
11
+ import cv2
12
+ import torch
13
+ import torch.nn.functional as F
14
+
15
+ from detectron2.data import MetadataCatalog
16
+ from detectron2.engine.defaults import DefaultPredictor
17
+ from detectron2.utils.video_visualizer import VideoVisualizer
18
+ from detectron2.utils.visualizer import ColorMode, Visualizer
19
+
20
+ from .modeling.utils import reset_cls_test
21
+
22
+ def get_text_encoder():
23
+ from proxydet.modeling.text.text_encoder import build_text_encoder
24
+ text_encoder = build_text_encoder(pretrain=True)
25
+ text_encoder.eval()
26
+ return text_encoder
27
+ def get_clip_embeddings(vocabulary, text_encoder, prompt='a '):
28
+ texts = [prompt + x for x in vocabulary]
29
+ emb = text_encoder(texts).detach().permute(1, 0).contiguous().cpu()
30
+ return emb
31
+
32
+ BUILDIN_CLASSIFIER = {
33
+ 'lvis': 'datasets/metadata/lvis_v1_clip_a+cname.npy',
34
+ 'objects365': 'datasets/metadata/o365_clip_a+cnamefix.npy',
35
+ 'openimages': 'datasets/metadata/oid_clip_a+cname.npy',
36
+ 'coco': 'datasets/metadata/coco_clip_a+cname.npy',
37
+ }
38
+
39
+ BUILDIN_METADATA_PATH = {
40
+ 'lvis': 'lvis_v1_val',
41
+ 'objects365': 'objects365_v2_val',
42
+ 'openimages': 'oid_val_expanded',
43
+ 'coco': 'coco_2017_val',
44
+ }
45
+
46
+ class VisualizationDemo(object):
47
+ def __init__(self, cfg, args,
48
+ instance_mode=ColorMode.IMAGE, parallel=False):
49
+ """
50
+ Args:
51
+ cfg (CfgNode):
52
+ instance_mode (ColorMode):
53
+ parallel (bool): whether to run the model in different processes from visualization.
54
+ Useful since the visualization logic can be slow.
55
+ """
56
+ if args.vocabulary == 'custom':
57
+ self.text_encoder = get_text_encoder()
58
+ self.metadata = MetadataCatalog.get("__unused")
59
+ self.metadata.thing_classes = args.custom_vocabulary.split(',')
60
+ classifier = get_clip_embeddings(self.metadata.thing_classes, self.text_encoder)
61
+ else:
62
+ self.metadata = MetadataCatalog.get(
63
+ BUILDIN_METADATA_PATH[args.vocabulary])
64
+ classifier = BUILDIN_CLASSIFIER[args.vocabulary]
65
+
66
+ num_classes = len(self.metadata.thing_classes)
67
+ self.cpu_device = torch.device("cpu")
68
+ self.instance_mode = instance_mode
69
+
70
+ self.parallel = parallel
71
+ if parallel:
72
+ num_gpu = torch.cuda.device_count()
73
+ self.predictor = AsyncPredictor(cfg, num_gpus=num_gpu)
74
+ else:
75
+ self.predictor = DefaultPredictor(cfg)
76
+ reset_cls_test(self.predictor.model, classifier, num_classes)
77
+
78
+ # reset base category mask, based on the similarity
79
+ self.zeroshot_weight_path = args.zeroshot_weight_path
80
+ self.base_cat_threshold = args.base_cat_threshold
81
+ if self.zeroshot_weight_path is not None:
82
+ self.trained_zs_classifier = torch.tensor(
83
+ np.load(self.zeroshot_weight_path),
84
+ dtype=torch.float32
85
+ ).permute(1, 0).contiguous().to(cfg.MODEL.DEVICE) # D x C
86
+ self.trained_zs_classifier = F.normalize(self.trained_zs_classifier, p=2, dim=0)
87
+
88
+ self.base_cat_indices = self.predictor.model.roi_heads.base_cat_mask.nonzero().squeeze(-1)
89
+ self.reset_base_cat_mask()
90
+
91
+ def reset_base_cat_mask(self, new_base_cat_mask=None):
92
+ if new_base_cat_mask is None:
93
+ # L2 normalized
94
+ if hasattr(self.predictor.model.roi_heads.box_predictor, "__getitem__"):
95
+ custom_classifier = self.predictor.model.roi_heads.box_predictor[0].cls_score.zs_weight
96
+ else:
97
+ custom_classifier = self.predictor.model.roi_heads.box_predictor.cls_score.zs_weight
98
+
99
+ base_cat_sim = custom_classifier.T[:-1, :] @ self.trained_zs_classifier[:, self.base_cat_indices]
100
+
101
+ # reset base cat mask
102
+ new_base_cat_mask = torch.cat([base_cat_sim.max(dim=1)[0] > self.base_cat_threshold, base_cat_sim.new_zeros((1))], dim=0) # for bg
103
+ new_base_cat_mask = new_base_cat_mask.bool()
104
+ self.predictor.model.roi_heads.base_cat_mask = new_base_cat_mask
105
+ return new_base_cat_mask
106
+
107
+ def reset_classifier(self, vocabs):
108
+ thing_classes = vocabs.split(',')
109
+ classifier = get_clip_embeddings(thing_classes, self.text_encoder)
110
+ reset_cls_test(self.predictor.model, classifier, len(thing_classes))
111
+
112
+ def run_on_image(self, image):
113
+ """
114
+ Args:
115
+ image (np.ndarray): an image of shape (H, W, C) (in BGR order).
116
+ This is the format used by OpenCV.
117
+
118
+ Returns:
119
+ predictions (dict): the output of the model.
120
+ vis_output (VisImage): the visualized image output.
121
+ """
122
+ vis_output = None
123
+ predictions = self.predictor(image)
124
+ # Convert image from OpenCV BGR format to Matplotlib RGB format.
125
+ image = image[:, :, ::-1]
126
+ visualizer = Visualizer(image, self.metadata, instance_mode=self.instance_mode)
127
+ if "panoptic_seg" in predictions:
128
+ panoptic_seg, segments_info = predictions["panoptic_seg"]
129
+ vis_output = visualizer.draw_panoptic_seg_predictions(
130
+ panoptic_seg.to(self.cpu_device), segments_info
131
+ )
132
+ else:
133
+ if "sem_seg" in predictions:
134
+ vis_output = visualizer.draw_sem_seg(
135
+ predictions["sem_seg"].argmax(dim=0).to(self.cpu_device)
136
+ )
137
+ if "instances" in predictions:
138
+ instances = predictions["instances"].to(self.cpu_device)
139
+ vis_output = visualizer.draw_instance_predictions(predictions=instances)
140
+
141
+ return predictions, vis_output
142
+
143
+ def _frame_from_video(self, video):
144
+ while video.isOpened():
145
+ success, frame = video.read()
146
+ if success:
147
+ yield frame
148
+ else:
149
+ break
150
+
151
+ def run_on_video(self, video):
152
+ """
153
+ Visualizes predictions on frames of the input video.
154
+
155
+ Args:
156
+ video (cv2.VideoCapture): a :class:`VideoCapture` object, whose source can be
157
+ either a webcam or a video file.
158
+
159
+ Yields:
160
+ ndarray: BGR visualizations of each video frame.
161
+ """
162
+ video_visualizer = VideoVisualizer(self.metadata, self.instance_mode)
163
+
164
+ def process_predictions(frame, predictions):
165
+ frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
166
+ if "panoptic_seg" in predictions:
167
+ panoptic_seg, segments_info = predictions["panoptic_seg"]
168
+ vis_frame = video_visualizer.draw_panoptic_seg_predictions(
169
+ frame, panoptic_seg.to(self.cpu_device), segments_info
170
+ )
171
+ elif "instances" in predictions:
172
+ predictions = predictions["instances"].to(self.cpu_device)
173
+ vis_frame = video_visualizer.draw_instance_predictions(frame, predictions)
174
+ elif "sem_seg" in predictions:
175
+ vis_frame = video_visualizer.draw_sem_seg(
176
+ frame, predictions["sem_seg"].argmax(dim=0).to(self.cpu_device)
177
+ )
178
+
179
+ # Converts Matplotlib RGB format to OpenCV BGR format
180
+ vis_frame = cv2.cvtColor(vis_frame.get_image(), cv2.COLOR_RGB2BGR)
181
+ return vis_frame
182
+
183
+ frame_gen = self._frame_from_video(video)
184
+ if self.parallel:
185
+ buffer_size = self.predictor.default_buffer_size
186
+
187
+ frame_data = deque()
188
+
189
+ for cnt, frame in enumerate(frame_gen):
190
+ frame_data.append(frame)
191
+ self.predictor.put(frame)
192
+
193
+ if cnt >= buffer_size:
194
+ frame = frame_data.popleft()
195
+ predictions = self.predictor.get()
196
+ yield process_predictions(frame, predictions)
197
+
198
+ while len(frame_data):
199
+ frame = frame_data.popleft()
200
+ predictions = self.predictor.get()
201
+ yield process_predictions(frame, predictions)
202
+ else:
203
+ for frame in frame_gen:
204
+ yield process_predictions(frame, self.predictor(frame))
205
+
206
+
207
+ class AsyncPredictor:
208
+ """
209
+ A predictor that runs the model asynchronously, possibly on >1 GPUs.
210
+ Because rendering the visualization takes considerably amount of time,
211
+ this helps improve throughput a little bit when rendering videos.
212
+ """
213
+
214
+ class _StopToken:
215
+ pass
216
+
217
+ class _PredictWorker(mp.Process):
218
+ def __init__(self, cfg, task_queue, result_queue):
219
+ self.cfg = cfg
220
+ self.task_queue = task_queue
221
+ self.result_queue = result_queue
222
+ super().__init__()
223
+
224
+ def run(self):
225
+ predictor = DefaultPredictor(self.cfg)
226
+
227
+ while True:
228
+ task = self.task_queue.get()
229
+ if isinstance(task, AsyncPredictor._StopToken):
230
+ break
231
+ idx, data = task
232
+ result = predictor(data)
233
+ self.result_queue.put((idx, result))
234
+
235
+ def __init__(self, cfg, num_gpus: int = 1):
236
+ """
237
+ Args:
238
+ cfg (CfgNode):
239
+ num_gpus (int): if 0, will run on CPU
240
+ """
241
+ num_workers = max(num_gpus, 1)
242
+ self.task_queue = mp.Queue(maxsize=num_workers * 3)
243
+ self.result_queue = mp.Queue(maxsize=num_workers * 3)
244
+ self.procs = []
245
+ for gpuid in range(max(num_gpus, 1)):
246
+ cfg = cfg.clone()
247
+ cfg.defrost()
248
+ cfg.MODEL.DEVICE = "cuda:{}".format(gpuid) if num_gpus > 0 else "cpu"
249
+ self.procs.append(
250
+ AsyncPredictor._PredictWorker(cfg, self.task_queue, self.result_queue)
251
+ )
252
+
253
+ self.put_idx = 0
254
+ self.get_idx = 0
255
+ self.result_rank = []
256
+ self.result_data = []
257
+
258
+ for p in self.procs:
259
+ p.start()
260
+ atexit.register(self.shutdown)
261
+
262
+ def put(self, image):
263
+ self.put_idx += 1
264
+ self.task_queue.put((self.put_idx, image))
265
+
266
+ def get(self):
267
+ self.get_idx += 1 # the index needed for this request
268
+ if len(self.result_rank) and self.result_rank[0] == self.get_idx:
269
+ res = self.result_data[0]
270
+ del self.result_data[0], self.result_rank[0]
271
+ return res
272
+
273
+ while True:
274
+ # make sure the results are returned in the correct order
275
+ idx, res = self.result_queue.get()
276
+ if idx == self.get_idx:
277
+ return res
278
+ insert = bisect.bisect(self.result_rank, idx)
279
+ self.result_rank.insert(insert, idx)
280
+ self.result_data.insert(insert, res)
281
+
282
+ def __len__(self):
283
+ return self.put_idx - self.get_idx
284
+
285
+ def __call__(self, image):
286
+ self.put(image)
287
+ return self.get()
288
+
289
+ def shutdown(self):
290
+ for _ in self.procs:
291
+ self.task_queue.put(AsyncPredictor._StopToken())
292
+
293
+ @property
294
+ def default_buffer_size(self):
295
+ return len(self.procs) * 5
requirements.txt CHANGED
@@ -7,10 +7,4 @@ mss<=6.1.0
7
  timm<=0.5.4
8
  lvis
9
  nltk<=3.7
10
- #git+https://github.com/facebookresearch/detectron2.git
11
-
12
  numpy>=1.18.5
13
- #torch>=1.7.0
14
- #torchvision>=0.8.1
15
- git+https://github.com/huggingface/transformers.git
16
- opencv-python
 
7
  timm<=0.5.4
8
  lvis
9
  nltk<=3.7
 
 
10
  numpy>=1.18.5
 
 
 
 
third_party/CenterNet2/.github/CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # Code of Conduct
2
+
3
+ Facebook has adopted a Code of Conduct that we expect project participants to adhere to.
4
+ Please read the [full text](https://code.fb.com/codeofconduct/)
5
+ so that you can understand what actions will and will not be tolerated.
third_party/CenterNet2/.github/CONTRIBUTING.md ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Contributing to detectron2
2
+
3
+ ## Issues
4
+ We use GitHub issues to track public bugs and questions.
5
+ Please make sure to follow one of the
6
+ [issue templates](https://github.com/facebookresearch/detectron2/issues/new/choose)
7
+ when reporting any issues.
8
+
9
+ Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe
10
+ disclosure of security bugs. In those cases, please go through the process
11
+ outlined on that page and do not file a public issue.
12
+
13
+ ## Pull Requests
14
+ We actively welcome pull requests.
15
+
16
+ However, if you're adding any significant features (e.g. > 50 lines), please
17
+ make sure to discuss with maintainers about your motivation and proposals in an issue
18
+ before sending a PR. This is to save your time so you don't spend time on a PR that we'll not accept.
19
+
20
+ We do not always accept new features, and we take the following
21
+ factors into consideration:
22
+
23
+ 1. Whether the same feature can be achieved without modifying detectron2.
24
+ Detectron2 is designed so that you can implement many extensions from the outside, e.g.
25
+ those in [projects](https://github.com/facebookresearch/detectron2/tree/master/projects).
26
+ * If some part of detectron2 is not extensible enough, you can also bring up a more general issue to
27
+ improve it. Such feature request may be useful to more users.
28
+ 2. Whether the feature is potentially useful to a large audience (e.g. an impactful detection paper, a popular dataset,
29
+ a significant speedup, a widely useful utility),
30
+ or only to a small portion of users (e.g., a less-known paper, an improvement not in the object
31
+ detection field, a trick that's not very popular in the community, code to handle a non-standard type of data)
32
+ * Adoption of additional models, datasets, new task are by default not added to detectron2 before they
33
+ receive significant popularity in the community.
34
+ We sometimes accept such features in `projects/`, or as a link in `projects/README.md`.
35
+ 3. Whether the proposed solution has a good design / interface. This can be discussed in the issue prior to PRs, or
36
+ in the form of a draft PR.
37
+ 4. Whether the proposed solution adds extra mental/practical overhead to users who don't
38
+ need such feature.
39
+ 5. Whether the proposed solution breaks existing APIs.
40
+
41
+ To add a feature to an existing function/class `Func`, there are always two approaches:
42
+ (1) add new arguments to `Func`; (2) write a new `Func_with_new_feature`.
43
+ To meet the above criteria, we often prefer approach (2), because:
44
+
45
+ 1. It does not involve modifying or potentially breaking existing code.
46
+ 2. It does not add overhead to users who do not need the new feature.
47
+ 3. Adding new arguments to a function/class is not scalable w.r.t. all the possible new research ideas in the future.
48
+
49
+ When sending a PR, please do:
50
+
51
+ 1. If a PR contains multiple orthogonal changes, split it to several PRs.
52
+ 2. If you've added code that should be tested, add tests.
53
+ 3. For PRs that need experiments (e.g. adding a new model or new methods),
54
+ you don't need to update model zoo, but do provide experiment results in the description of the PR.
55
+ 4. If APIs are changed, update the documentation.
56
+ 5. We use the [Google style docstrings](https://www.sphinx-doc.org/en/master/usage/extensions/napoleon.html) in python.
57
+ 6. Make sure your code lints with `./dev/linter.sh`.
58
+
59
+
60
+ ## Contributor License Agreement ("CLA")
61
+ In order to accept your pull request, we need you to submit a CLA. You only need
62
+ to do this once to work on any of Facebook's open source projects.
63
+
64
+ Complete your CLA here: <https://code.facebook.com/cla>
65
+
66
+ ## License
67
+ By contributing to detectron2, you agree that your contributions will be licensed
68
+ under the LICENSE file in the root directory of this source tree.
third_party/CenterNet2/.github/Detectron2-Logo-Horz.svg ADDED
third_party/CenterNet2/.github/ISSUE_TEMPLATE.md ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+
2
+ Please select an issue template from
3
+ https://github.com/facebookresearch/detectron2/issues/new/choose .
4
+
5
+ Otherwise your issue will be closed.
third_party/CenterNet2/.github/ISSUE_TEMPLATE/bugs.md ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ name: "πŸ› Bugs"
3
+ about: Report bugs in detectron2
4
+ title: Please read & provide the following
5
+
6
+ ---
7
+
8
+ ## Instructions To Reproduce the πŸ› Bug:
9
+ 1. Full runnable code or full changes you made:
10
+ ```
11
+ If making changes to the project itself, please use output of the following command:
12
+ git rev-parse HEAD; git diff
13
+
14
+ <put code or diff here>
15
+ ```
16
+ 2. What exact command you run:
17
+ 3. __Full logs__ or other relevant observations:
18
+ ```
19
+ <put logs here>
20
+ ```
21
+ 4. please simplify the steps as much as possible so they do not require additional resources to
22
+ run, such as a private dataset.
23
+
24
+ ## Expected behavior:
25
+
26
+ If there are no obvious error in "full logs" provided above,
27
+ please tell us the expected behavior.
28
+
29
+ ## Environment:
30
+
31
+ Provide your environment information using the following command:
32
+ ```
33
+ wget -nc -q https://github.com/facebookresearch/detectron2/raw/main/detectron2/utils/collect_env.py && python collect_env.py
34
+ ```
35
+
36
+ If your issue looks like an installation issue / environment issue,
37
+ please first try to solve it yourself with the instructions in
38
+ https://detectron2.readthedocs.io/tutorials/install.html#common-installation-issues
third_party/CenterNet2/.github/ISSUE_TEMPLATE/config.yml ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # require an issue template to be chosen
2
+ blank_issues_enabled: false
3
+
4
+ contact_links:
5
+ - name: How-To / All Other Questions
6
+ url: https://github.com/facebookresearch/detectron2/discussions
7
+ about: Use "github discussions" for community support on general questions that don't belong to the above issue categories
8
+ - name: Detectron2 Documentation
9
+ url: https://detectron2.readthedocs.io/index.html
10
+ about: Check if your question is answered in tutorials or API docs
11
+
12
+ # Unexpected behaviors & bugs are split to two templates.
13
+ # When they are one template, users think "it's not a bug" and don't choose the template.
14
+ #
15
+ # But the file name is still "unexpected-problems-bugs.md" so that old references
16
+ # to this issue template still works.
17
+ # It's ok since this template should be a superset of "bugs.md" (unexpected behaviors is a superset of bugs)
third_party/CenterNet2/.github/ISSUE_TEMPLATE/documentation.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ name: "\U0001F4DA Documentation Issue"
3
+ about: Report a problem about existing documentation, comments, website or tutorials.
4
+ labels: documentation
5
+
6
+ ---
7
+
8
+ ## πŸ“š Documentation Issue
9
+
10
+ This issue category is for problems about existing documentation, not for asking how-to questions.
11
+
12
+ * Provide a link to an existing documentation/comment/tutorial:
13
+
14
+ * How should the above documentation/comment/tutorial improve: