sky24h committed a9d25c7 (0 parents)

gradio demo for ZeroGPU, HF
.gitattributes ADDED
@@ -0,0 +1,35 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,171 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+
30
+ *.pth
31
+ *.bin
32
+ *.log
33
+ *.safetensors
34
+ outputs/
35
+ outputs_single/
36
+ results/
37
+ pretrained-models/checkpoints/
38
+
39
+
40
+ # PyInstaller
41
+ # Usually these files are written by a python script from a template
42
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
43
+ *.manifest
44
+ *.spec
45
+
46
+ # Installer logs
47
+ pip-log.txt
48
+ pip-delete-this-directory.txt
49
+
50
+ # Unit test / coverage reports
51
+ htmlcov/
52
+ .tox/
53
+ .nox/
54
+ .coverage
55
+ .coverage.*
56
+ .cache
57
+ nosetests.xml
58
+ coverage.xml
59
+ *.cover
60
+ *.py,cover
61
+ .hypothesis/
62
+ .pytest_cache/
63
+ cover/
64
+
65
+ # Translations
66
+ *.mo
67
+ *.pot
68
+
69
+ # Django stuff:
70
+ *.log
71
+ local_settings.py
72
+ db.sqlite3
73
+ db.sqlite3-journal
74
+
75
+ # Flask stuff:
76
+ instance/
77
+ .webassets-cache
78
+
79
+ # Scrapy stuff:
80
+ .scrapy
81
+
82
+ # Sphinx documentation
83
+ docs/_build/
84
+
85
+ # PyBuilder
86
+ .pybuilder/
87
+ target/
88
+
89
+ # Jupyter Notebook
90
+ .ipynb_checkpoints
91
+
92
+ # IPython
93
+ profile_default/
94
+ ipython_config.py
95
+
96
+ # pyenv
97
+ # For a library or package, you might want to ignore these files since the code is
98
+ # intended to run in multiple environments; otherwise, check them in:
99
+ # .python-version
100
+
101
+ # pipenv
102
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
103
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
104
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
105
+ # install all needed dependencies.
106
+ #Pipfile.lock
107
+
108
+ # poetry
109
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
110
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
111
+ # commonly ignored for libraries.
112
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
113
+ #poetry.lock
114
+
115
+ # pdm
116
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
117
+ #pdm.lock
118
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
119
+ # in version control.
120
+ # https://pdm.fming.dev/#use-with-ide
121
+ .pdm.toml
122
+
123
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
124
+ __pypackages__/
125
+
126
+ # Celery stuff
127
+ celerybeat-schedule
128
+ celerybeat.pid
129
+
130
+ # SageMath parsed files
131
+ *.sage.py
132
+
133
+ # Environments
134
+ .env
135
+ .venv
136
+ env/
137
+ venv/
138
+ ENV/
139
+ env.bak/
140
+ venv.bak/
141
+
142
+ # Spyder project settings
143
+ .spyderproject
144
+ .spyproject
145
+
146
+ # Rope project settings
147
+ .ropeproject
148
+
149
+ # mkdocs documentation
150
+ /site
151
+
152
+ # mypy
153
+ .mypy_cache/
154
+ .dmypy.json
155
+ dmypy.json
156
+
157
+ # Pyre type checker
158
+ .pyre/
159
+
160
+ # pytype static type analyzer
161
+ .pytype/
162
+
163
+ # Cython debug symbols
164
+ cython_debug/
165
+
166
+ # PyCharm
167
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
168
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
169
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
170
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
171
+ #.idea/
README.md ADDED
@@ -0,0 +1,13 @@
1
+ ---
2
+ title: Training-Free Zero-Shot Semantic Segmentation With LLM Refinement
3
+ emoji: ⚡
4
+ colorFrom: blue
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: 4.38.1
8
+ app_file: app.py
9
+ pinned: false
10
+ license: agpl-3.0
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,121 @@
1
+ import os
2
+ import cv2
3
+ import spaces
4
+ import gradio as gr
5
+ from PIL import Image
6
+ from omegaconf import OmegaConf
7
+
8
+ # set up environment
9
+ from utils.env_utils import set_random_seed, use_lower_vram
10
+ from utils.timer_utils import Timer
11
+
12
+ set_random_seed(1024)
13
+ timer = Timer()
14
+ timer.start()
15
+ # use_lower_vram()
16
+
17
+ # import functions
18
+ from utils.labels_utils import Labels
19
+ from utils.ram_utils import ram_inference
20
+ from utils.blip2_utils import blip2_caption
21
+ from utils.llms_utils import pre_refinement, make_prompt, init_model
22
+ from utils.grounded_sam_utils import run_grounded_sam
23
+
24
+
25
+ # hardcode parameters for G-SAM
26
+ box_threshold = 0.18
27
+ text_threshold = 0.15
28
+ iou_threshold = 0.8
29
+
30
+ global current_config, L, llm, system_prompt
31
+
32
+ # load Llama-3 here to avoid loading it during the inference.
33
+ llm = init_model("Meta-Llama-3-8B-Instruct")
34
+ current_config = ""
35
+ L = None
36
+ system_prompt = None
37
+
38
+ def load_config(config_type):
39
+ config = OmegaConf.load(os.path.join(os.path.dirname(__file__), f"configs/{config_type}.yaml"))
40
+ L = Labels(config=config)
41
+ # Init labels and the LLM system prompt. Only Meta-Llama-3-8B-Instruct is supported in this online demo, but any supported model can be used in a local environment with the released code.
42
+ system_prompt = make_prompt(", ".join(L.LABELS))
43
+ return L, system_prompt
44
+
45
+ @spaces.GPU(duration=120)
46
+ def process(image_ori, config_type):
47
+ global current_config, L, llm, system_prompt
48
+ if current_config != config_type:
49
+ L, system_prompt = load_config(config_type)
50
+ current_config = config_type
51
+ else:
52
+ pass
53
+ image_ori = cv2.cvtColor(image_ori, cv2.COLOR_BGR2RGB)
54
+ image_pil = Image.fromarray(image_ori)
55
+ labels_ram = ram_inference(image_pil) + ": " + blip2_caption(image_pil)
56
+ converted_labels, llm_output = pre_refinement([labels_ram], system_prompt, llm=llm)
57
+ labels_llm = L.check_labels(converted_labels)[0]
58
+ print("labels_ram: ", labels_ram)
59
+ print("llm_output: ", llm_output)
60
+ print("labels_llm: ", labels_llm)
61
+
62
+ # run sam
63
+ label_res, bboxes, output_labels, output_prob_maps, output_points = run_grounded_sam(
64
+ input_image = {"image": image_pil, "mask": None},
65
+ text_prompt = labels_llm,
66
+ box_threshold = box_threshold,
67
+ text_threshold = text_threshold,
68
+ iou_threshold = iou_threshold,
69
+ LABELS = L.LABELS,
70
+ IDS = L.IDS,
71
+ llm = llm,
72
+ timer = timer,
73
+ )
74
+
75
+ # draw mask and save image
76
+ ours = L.draw_mask(label_res, image_ori, print_label=True, tag="Ours")
77
+ return cv2.cvtColor(ours, cv2.COLOR_BGR2RGB)
78
+
79
+
80
+ if __name__ == "__main__":
81
+ # options for different settings
82
+ dropdown_options = ["COCO-81", "Cityscapes", "DRAM", "VOC2012"]
83
+ default_option = "COCO-81"
84
+
85
+ with gr.Blocks() as demo:
86
+ gr.HTML(
87
+ """
88
+ <h1 style="text-align: center; font-size: 32px; font-family: 'Times New Roman', Times, serif;">
89
+ Training-Free Zero-Shot Semantic Segmentation with LLM Refinement
90
+ </h1>
91
+ <p style="text-align: center; font-size: 20px; font-family: 'Times New Roman', Times, serif;">
92
+ <a style="text-align: center; display:inline-block"
93
+ href="https://sky24h.github.io/websites/bmvc2024_training-free-semseg-with-LLM/">
94
+ <img src="https://huggingface.co/datasets/huggingface/badges/raw/main/paper-page-sm.svg#center"
95
+ alt="Paper Page">
96
+ </a>
97
+ <a style="text-align: center; display:inline-block" href="https://huggingface.co/spaces/sky24h/Training-Free_Zero-Shot_Semantic_Segmentation_with_LLM_Refinement?duplicate=true">
98
+ <img src="https://huggingface.co/datasets/huggingface/badges/raw/main/duplicate-this-space-sm.svg#center" alt="Duplicate Space">
99
+ </a>
100
+ </p>
101
+ """
102
+ )
103
+ gr.Interface(
104
+ fn=process,
105
+ inputs=[gr.Image(type="numpy", height="384"), gr.Dropdown(choices=dropdown_options, label="Refinement Type", value=default_option)],
106
+ outputs="image",
107
+ description="""<html>
108
+ <p style="text-align:center;"> This is an online demo for the paper "Training-Free Zero-Shot Semantic Segmentation with LLM Refinement" (BMVC 2024). </p>
109
+ <p style="text-align:center;"> Usage: Please select or upload an image and choose a dataset setting for semantic segmentation refinement.</p>
110
+ </html>""",
111
+ allow_flagging='never',
112
+ examples=[
113
+ ["examples/Cityscapes_eg.jpg", "Cityscapes"],
114
+ ["examples/DRAM_eg.jpg", "DRAM"],
115
+ ["examples/COCO-81_eg.jpg", "COCO-81"],
116
+ ["examples/VOC2012_eg.jpg", "VOC2012"],
117
+ ],
118
+ cache_examples=True,
119
+ )
120
+
121
+ demo.queue(max_size=10).launch()
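
app.py wires the full pipeline together: RAM++ tagging plus a BLIP-2 caption feed the LLM pre-refinement, the refined labels drive Grounded-SAM, and the resulting mask is blended over the input. The same process() entry point can also be driven without the Gradio UI. The following is only a minimal sketch, not part of the commit; it assumes the checkpoints under pretrained-models/checkpoints are present, a CUDA GPU is available, the spaces package is installed, and HF_TOKEN grants access to Meta-Llama-3-8B-Instruct (the script name and output path are illustrative):

# hypothetical local driver, not part of this commit
import cv2
from app import process  # importing app loads RAM++, BLIP-2, Llama-3 and Grounded-SAM at module level

image_bgr = cv2.imread("examples/COCO-81_eg.jpg")   # cv2 reads BGR, which process() converts to RGB
result_rgb = process(image_bgr, "COCO-81")          # returns the blended segmentation as an RGB array
cv2.imwrite("result.png", cv2.cvtColor(result_rgb, cv2.COLOR_RGB2BGR))
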
configs/COCO-81.yaml ADDED
@@ -0,0 +1,3 @@
1
+ Name: COCO-81
2
+ label_list: "unlabeled, person, bicycle, car, motorcycle, airplane, bus, train, truck, boat, traffic-light, fire-hydrant, stop-sign, parking-meter, bench, bird, cat, dog, horse, sheep, cow, elephant, bear, zebra, giraffe, backpack, umbrella, handbag, tie, suitcase, frisbee, skis, snowboard, sports-ball, kite, baseball-bat, baseball-glove, skateboard, surfboard, tennis-racket, bottle, wine-glass, cup, fork, knife, spoon, bowl, banana, apple, sandwich, orange, broccoli, carrot, hot-dog, pizza, donut, cake, chair, couch, potted-plant, bed, dining-table, toilet, tv, laptop, mouse, remote, keyboard, cell-phone, microwave, oven, toaster, sink, refrigerator, book, clock, vase, scissors, teddy-bear, hair-drier, toothbrush"
3
+ mask_ids: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 67, 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90]
configs/Cityscapes.yaml ADDED
@@ -0,0 +1,3 @@
1
+ Name: Cityscapes
2
+ label_list: "background, road, sidewalk, building, wall, fence, pole, traffic-light, traffic-sign, tree, terrain, sky, person, rider, car, truck, bus, train, motorcycle, bicycle"
3
+ mask_ids: [0, 7, 8, 11, 12, 13, 17, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 31, 32, 33]
configs/DRAM.yaml ADDED
@@ -0,0 +1,3 @@
1
+ Name: DRAM
2
+ label_list: "background, bird, boat, bottle, cat, chair, cow, dog, horse, person, potted-plant, sheep"
3
+ mask_ids: [0, 3, 4, 5, 8, 9, 10, 12, 13, 15, 16, 17]
configs/VOC2012.yaml ADDED
@@ -0,0 +1,2 @@
1
+ Name: VOC2012
2
+ label_list: "background, aeroplane, bicycle, bird, boat, bottle, bus, car, cat, chair, cow, dining-table, dog, horse, motorbike, person, potted-plant, sheep, sofa, train, monitor, void"
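
The four config files above share one small schema: Name, a comma-separated label_list, and an optional mask_ids list mapping each label to the dataset's mask index (VOC2012 omits mask_ids, so utils/labels_utils.py falls back to consecutive indices 0..N-1). A minimal sketch of how app.py consumes such a config, shown here for Cityscapes:

from omegaconf import OmegaConf
from utils.labels_utils import Labels
from utils.llms_utils import make_prompt

config = OmegaConf.load("configs/Cityscapes.yaml")
L = Labels(config=config)                          # splits label_list, resolves mask_ids, builds a colormap
system_prompt = make_prompt(", ".join(L.LABELS))   # system prompt constraining the LLM to these labels
print(L.LABELS[:3], list(L.IDS[:3]))               # -> ['background', 'road', 'sidewalk'] [0, 7, 8]
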
examples/COCO-81_eg.jpg ADDED
examples/Cityscapes_eg.jpg ADDED
examples/DRAM_eg.jpg ADDED
examples/VOC2012_eg.jpg ADDED
gradio_cached_examples/16/log.csv ADDED
@@ -0,0 +1,5 @@
1
+ output,flag,username,timestamp
2
+ "{""path"": ""gradio_cached_examples/16/output/e9085a590715dd9a4cbc/image.webp"", ""url"": ""/file=/tmp/gradio/f17a9230acfa1f7c9d09b85c0c0528e64c5a19ec/image.webp"", ""size"": null, ""orig_name"": ""image.webp"", ""mime_type"": null, ""is_stream"": false, ""meta"": {""_type"": ""gradio.FileData""}}",,,2024-07-30 06:54:26.975686
3
+ "{""path"": ""gradio_cached_examples/16/output/ca6408b417ed4de51f74/image.webp"", ""url"": ""/file=/tmp/gradio/28f694172a8e086c7d12474e78b1e36453357589/image.webp"", ""size"": null, ""orig_name"": ""image.webp"", ""mime_type"": null, ""is_stream"": false, ""meta"": {""_type"": ""gradio.FileData""}}",,,2024-07-30 06:54:28.056743
4
+ "{""path"": ""gradio_cached_examples/16/output/6b3896574851c5665d17/image.webp"", ""url"": ""/file=/tmp/gradio/f2a070d0cd932cc4bf9ddb7f4cca22c01d4d4e37/image.webp"", ""size"": null, ""orig_name"": ""image.webp"", ""mime_type"": null, ""is_stream"": false, ""meta"": {""_type"": ""gradio.FileData""}}",,,2024-07-30 06:54:30.868722
5
+ "{""path"": ""gradio_cached_examples/16/output/8d66e32b3b15feb7ecc9/image.webp"", ""url"": ""/file=/tmp/gradio/9318730ca69938781665675ccbe76d635bc47a2d/image.webp"", ""size"": null, ""orig_name"": ""image.webp"", ""mime_type"": null, ""is_stream"": false, ""meta"": {""_type"": ""gradio.FileData""}}",,,2024-07-30 06:54:32.230374
gradio_cached_examples/16/output/6b3896574851c5665d17/image.webp ADDED
gradio_cached_examples/16/output/8d66e32b3b15feb7ecc9/image.webp ADDED
gradio_cached_examples/16/output/ca6408b417ed4de51f74/image.webp ADDED
gradio_cached_examples/16/output/e9085a590715dd9a4cbc/image.webp ADDED
pre-requirements.txt ADDED
@@ -0,0 +1,19 @@
1
+ # transformers and timm are installed in a second stage (requirements.txt) to avoid an installation error with GroundingDINO
2
+ torch==2.3.1 #pip install torch==2.3.1 torchvision==0.18.1 --index-url https://download.pytorch.org/whl/cu121
3
+ torchvision==0.18.1
4
+ setuptools==69.5.1
5
+ gradio==4.38.1
6
+ openai>=1.0.0
7
+ opencv_python==4.8.1.78
8
+ diffusers[torch]==0.29.2
9
+ termcolor
10
+ fairscale
11
+ natsort
12
+ omegaconf
13
+ pycocotools
14
+ matplotlib
15
+ onnxruntime
16
+ onnx
17
+ groundingdino-py
18
+ segment_anything@git+https://github.com/SysCV/sam-hq.git
19
+ ram@git+https://github.com/xinyu1205/recognize-anything.git
pretrained-models/GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py ADDED
@@ -0,0 +1,43 @@
1
+ batch_size = 1
2
+ modelname = "groundingdino"
3
+ backbone = "swin_T_224_1k"
4
+ position_embedding = "sine"
5
+ pe_temperatureH = 20
6
+ pe_temperatureW = 20
7
+ return_interm_indices = [1, 2, 3]
8
+ backbone_freeze_keywords = None
9
+ enc_layers = 6
10
+ dec_layers = 6
11
+ pre_norm = False
12
+ dim_feedforward = 2048
13
+ hidden_dim = 256
14
+ dropout = 0.0
15
+ nheads = 8
16
+ num_queries = 900
17
+ query_dim = 4
18
+ num_patterns = 0
19
+ num_feature_levels = 4
20
+ enc_n_points = 4
21
+ dec_n_points = 4
22
+ two_stage_type = "standard"
23
+ two_stage_bbox_embed_share = False
24
+ two_stage_class_embed_share = False
25
+ transformer_activation = "relu"
26
+ dec_pred_bbox_embed_share = True
27
+ dn_box_noise_scale = 1.0
28
+ dn_label_noise_ratio = 0.5
29
+ dn_label_coef = 1.0
30
+ dn_bbox_coef = 1.0
31
+ embed_init_tgt = True
32
+ dn_labelbook_size = 2000
33
+ max_text_len = 256
34
+ text_encoder_type = "bert-base-uncased"
35
+ use_text_enhancer = True
36
+ use_fusion_layer = True
37
+ use_checkpoint = True
38
+ use_transformer_ckpt = True
39
+ use_text_cross_attention = True
40
+ text_dropout = 0.0
41
+ fusion_dropout = 0.0
42
+ fusion_droppath = 0.1
43
+ sub_sentence_present = True
pretrained-models/checkpoints/groundingdino_swint_ogc.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b3ca2563c77c69f651d7bd133e97139c186df06231157a64c507099c52bc799
3
+ size 693997677
pretrained-models/checkpoints/put pre-trained checkpoints here.txt ADDED
File without changes
pretrained-models/checkpoints/ram_plus_swin_large_14m.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:497c178836ba66698ca226c7895317e6e800034be986452dbd2593298d50e87d
3
+ size 3010210801
pretrained-models/checkpoints/sam_hq_vit_l.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1a6c385d62bf005ded91a54d5ec55c985cfc4103ef89c08d90f39f04934c343
3
+ size 1254865805
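
The three .pth files above are committed as Git LFS pointers (version, sha256 oid, size) rather than raw weights. When reproducing the Space locally, the same checkpoints can also be pulled from their upstream repositories, whose ids appear in the commented-out downloader in utils/env_utils.py. A minimal sketch using huggingface_hub, assuming the filenames sit at the repo roots as in those URLs:

from huggingface_hub import hf_hub_download

ckpt_dir = "pretrained-models/checkpoints"
hf_hub_download("ShilongLiu/GroundingDINO", "groundingdino_swint_ogc.pth", local_dir=ckpt_dir)
hf_hub_download("lkeab/hq-sam", "sam_hq_vit_l.pth", local_dir=ckpt_dir)
hf_hub_download("xinyu1205/recognize-anything-plus-model", "ram_plus_swin_large_14m.pth", local_dir=ckpt_dir)
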
requirements.txt ADDED
@@ -0,0 +1,3 @@
1
+ # Somehow, we needed to put transformers after GroundingDINO.
2
+ transformers==4.42.4
3
+ timm==1.0.8
utils/Arial.ttf ADDED
Binary file (276 kB)
utils/blip2_utils.py ADDED
@@ -0,0 +1,40 @@
1
+ import os
2
+ import torch
3
+ from transformers import Blip2Processor, Blip2ForConditionalGeneration # , BitsAndBytesConfig
4
+ from .env_utils import get_device, low_vram_mode
5
+
6
+ device = get_device()
7
+
8
+ blip2_model_id = "Salesforce/blip2-opt-2.7b" # or replace with your local model path
9
+ blip2_precision = torch.bfloat16
10
+
11
+ # Load BLIP2 model and processor from HuggingFace
12
+ blip2_processor = Blip2Processor.from_pretrained(blip2_model_id)
13
+ if low_vram_mode:
14
+ blip2_model = Blip2ForConditionalGeneration.from_pretrained(
15
+ blip2_model_id,
16
+ torch_dtype=blip2_precision,
17
+ device_map=device,
18
+ # quantization_config = BitsAndBytesConfig(load_in_8bit=True) if low_vram_mode else None, # ZeroGPU does not support quantization.
19
+ ).eval()
20
+ else:
21
+ blip2_model = Blip2ForConditionalGeneration.from_pretrained(blip2_model_id, torch_dtype=blip2_precision, device_map=device).eval()
22
+
23
+
24
+ def blip2_caption(raw_image):
25
+ # unconditional image captioning
26
+ inputs = blip2_processor(raw_image, return_tensors="pt")
27
+ inputs = inputs.to(device=device, dtype=blip2_precision)
28
+ out = blip2_model.generate(**inputs)
29
+ caption = blip2_processor.decode(out[0], skip_special_tokens=True)
30
+ return caption
31
+
32
+
33
+ # if __name__ == "__main__":
34
+ # from PIL import Image
35
+
36
+ # # Test the RAM++ model
37
+ # image_path = os.path.join(os.path.dirname(__file__), "../sources/test_imgs/1.jpg")
38
+ # image = Image.open(image_path)
39
+ # result = blip2_caption(image)
40
+ # print(result)
utils/env_utils.py ADDED
@@ -0,0 +1,56 @@
1
+ # Avoid multiple imports of the same module. Use this to import the module only once.
2
+ # Also, ensure that the device and pretrained models folder are consistent across the project.
3
+
4
+ import os
5
+ import torch
6
+
7
+ global low_vram_mode
8
+ low_vram_mode = False
9
+
10
+
11
+ def use_lower_vram():
12
+ global low_vram_mode
13
+ low_vram_mode = True
14
+
15
+
16
+ def get_device():
17
+ device = torch.device("cuda") # must use GPU in online demo version
18
+ return device
19
+
20
+
21
+ def set_random_seed(seed: int):
22
+ torch.manual_seed(seed)
23
+ torch.cuda.manual_seed(seed)
24
+ torch.cuda.manual_seed_all(seed)
25
+ torch.backends.cudnn.deterministic = True
26
+ torch.backends.cudnn.benchmark = False
27
+
28
+
29
+ def get_pretrained_models_folder():
30
+ return os.path.join(os.path.dirname(__file__), "../pretrained-models")
31
+
32
+
33
+ # def download_pretrained_models():
34
+ # pretrained_models_folder = get_pretrained_models_folder()
35
+ # # hard-coded download links
36
+ # groundingdino_link = "https://huggingface.co/ShilongLiu/GroundingDINO/resolve/main/groundingdino_swint_ogc.pth"
37
+ # sam_link = "https://huggingface.co/lkeab/hq-sam/resolve/main/sam_hq_vit_l.pth"
38
+ # ram_link = "https://huggingface.co/xinyu1205/recognize-anything-plus-model/resolve/main/ram_plus_swin_large_14m.pth"
39
+ # groundingdino_ckpt = os.path.join(pretrained_models_folder, "checkpoints/groundingdino_swint_ogc.pth")
40
+ # sam_ckpt = os.path.join(pretrained_models_folder, "checkpoints/sam_hq_vit_l.pth")
41
+ # ram_ckpt = os.path.join(pretrained_models_folder, "checkpoints/ram_plus_swin_large_14m.pth")
42
+
43
+ # # download pretrained models if not exists
44
+ # if not os.path.exists(groundingdino_ckpt):
45
+ # print(f"Downloading pretrained model: {groundingdino_ckpt}")
46
+ # os.system(f"wget -O {groundingdino_ckpt} {groundingdino_link} -q")
47
+ # if not os.path.exists(sam_ckpt):
48
+ # print(f"Downloading pretrained model: {sam_ckpt}")
49
+ # os.system(f"wget -O {sam_ckpt} {sam_link} -q")
50
+ # if not os.path.exists(ram_ckpt):
51
+ # print(f"Downloading pretrained model: {ram_ckpt}")
52
+ # os.system(f"wget -O {ram_ckpt} {ram_link} -q")
53
+
54
+
55
+ # # download pretrained models when imported
56
+ # download_pretrained_models()
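
One detail worth noting: blip2_utils.py, llms_utils.py and grounded_sam_utils.py import low_vram_mode by value (from .env_utils import ... low_vram_mode), so use_lower_vram() only takes effect if it is called before those modules are first imported. This is why app.py sets the seed (and, optionally, the low-VRAM flag) ahead of the model imports. A minimal sketch of the intended ordering, not part of the commit:

from utils.env_utils import set_random_seed, use_lower_vram

set_random_seed(1024)
use_lower_vram()        # must run before the model modules capture low_vram_mode

# only now import the modules that read the flag at import time
from utils.blip2_utils import blip2_caption
from utils.llms_utils import init_model
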
utils/grounded_sam_utils.py ADDED
@@ -0,0 +1,348 @@
1
+ import os
2
+ import cv2
3
+ import torch
4
+ import torchvision
5
+ import numpy as np
6
+ from PIL import Image, ImageFont
7
+ import traceback
8
+
9
+ # environment variables and paths
10
+ from .env_utils import get_device, get_pretrained_models_folder, low_vram_mode
11
+
12
+ device = get_device()
13
+ pretrained_models_folder = get_pretrained_models_folder()
14
+ groundingdino_ckpt = os.path.join(pretrained_models_folder, "checkpoints/groundingdino_swint_ogc.pth")
15
+ sam_ckpt = os.path.join(pretrained_models_folder, "checkpoints/sam_hq_vit_l.pth")
16
+
17
+ # segment anything
18
+ from segment_anything import build_sam_vit_l, SamPredictor
19
+
20
+ # Grounding DINO
21
+ from groundingdino.models import build_model
22
+ from groundingdino.util.slconfig import SLConfig
23
+ from groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap
24
+ import groundingdino.datasets.transforms as T
25
+
26
+ font_family = os.path.join(os.path.dirname(__file__), "Arial.ttf")
27
+ font_size = 24
28
+ font = ImageFont.truetype(font_family, font_size)
29
+
30
+ from .llms_utils import post_refinement
31
+
32
+
33
+ def draw_bboxes(ours_bboxes, output_labels, bboxes, output_points, output_prob_maps):
34
+ # draw bboxes on the image
35
+ for label, bbox in zip(output_labels, bboxes):
36
+ bbox = bbox.cpu().numpy()
37
+ bbox = [int(round(bbox[0])), int(round(bbox[1])), int(round(bbox[2])), int(round(bbox[3]))]
38
+ # print("label, bbox", label, bbox)
39
+ cv2.rectangle(ours_bboxes, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (0, 255, 0), 2)
40
+ # caption inside the bbox, below the top left corner 20 pixels
41
+ cv2.putText(ours_bboxes, label, (bbox[0], bbox[1] + 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
42
+ try:
43
+ for points in output_points:
44
+ for point in points:
45
+ # draw a cross on the point
46
+ cv2.drawMarker(ours_bboxes, (int(point[0]), int(point[1])), (0, 0, 255), cv2.MARKER_CROSS, 10, 2)
47
+ except: # noqa
48
+ pass
49
+
50
+ # Draw the probability maps
51
+ # if output_prob_maps is not None:
52
+ # output_prob_maps = np.concatenate(output_prob_maps, axis=1)
53
+ # ours_bboxes = np.concatenate([output_prob_maps, ours_bboxes], axis=1)
54
+ return ours_bboxes
55
+
56
+
57
+ def transform_image(image_pil):
58
+ transform = T.Compose(
59
+ [
60
+ T.RandomResize([800], max_size=1333),
61
+ T.ToTensor(),
62
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
63
+ ]
64
+ )
65
+ image, _ = transform(image_pil, None) # 3, h, w
66
+ return image
67
+
68
+
69
+ def _load_model(model_config_path, model_checkpoint_path, device):
70
+ args = SLConfig.fromfile(model_config_path)
71
+ args.device = device
72
+ model = build_model(args)
73
+ model.load_state_dict(clean_state_dict(torch.load(model_checkpoint_path, map_location="cpu")["model"]), strict=False)
74
+ return model.to(device=device).eval()
75
+
76
+
77
+ def get_grounding_output(model, image, caption, box_threshold, text_threshold, with_logits=True):
78
+ caption = caption.lower()
79
+ caption = caption.strip()
80
+ if not caption.endswith("."):
81
+ caption = caption + "."
82
+
83
+ with torch.no_grad():
84
+ outputs = model(image[None], captions=[caption])
85
+ logits = outputs["pred_logits"].cpu().sigmoid()[0] # (nq, 256)
86
+ boxes = outputs["pred_boxes"].cpu()[0] # (nq, 4)
87
+ logits.shape[0]
88
+
89
+ # filter output
90
+ logits_filt = logits.clone()
91
+ boxes_filt = boxes.clone()
92
+ filt_mask = logits_filt.max(dim=1)[0] > box_threshold
93
+ logits_filt = logits_filt[filt_mask] # num_filt, 256
94
+ boxes_filt = boxes_filt[filt_mask] # num_filt, 4
95
+ logits_filt.shape[0]
96
+
97
+ # get phrase
98
+ tokenlizer = model.tokenizer
99
+ tokenized = tokenlizer(caption)
100
+ # build pred
101
+ pred_phrases = []
102
+ scores = []
103
+ for logit, box in zip(logits_filt, boxes_filt):
104
+ pred_phrase = get_phrases_from_posmap(logit > text_threshold, tokenized, tokenlizer)
105
+ if with_logits:
106
+ pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})")
107
+ else:
108
+ pred_phrases.append(pred_phrase)
109
+ scores.append(logit.max().item())
110
+ return boxes_filt, torch.Tensor(scores), pred_phrases
111
+
112
+
113
+ def postprocess_masks(input_masks, input_pred_phrases):
114
+ input_masks_ = input_masks.cpu().numpy().transpose(0, 2, 3, 1).copy()
115
+ output_masks = input_masks.cpu().numpy().transpose(0, 2, 3, 1).copy()
116
+ for i in range(len(output_masks)):
117
+ for j in range(len(output_masks)):
118
+ if i == j:
119
+ continue
120
+ if ((input_masks_[i] * input_masks_[j]).sum() > 0) and (input_pred_phrases[i].split("(")[0] != input_pred_phrases[j].split("(")[0]):
121
+ # if two masks overlap and have different labels
122
+ if float(input_pred_phrases[i].split("(")[1].split(")")[0]) < float(input_pred_phrases[j].split("(")[1].split(")")[0]):
123
+ # if the score of the first mask is lower than the second mask, remove overlapping area from the first mask
124
+ output_masks[i] = np.logical_and(output_masks[i], np.logical_not(input_masks_[j]))
125
+ else:
126
+ # otherwise, remove overlapping area from the second mask
127
+ output_masks[j] = np.logical_and(output_masks[j], np.logical_not(input_masks_[i]))
128
+ return output_masks.transpose(3, 0, 1, 2)[0]
129
+
130
+
131
+ groundingdino_model = None
132
+ sam_predictor = None
133
+ already_converted = {}
134
+ config_file = os.path.join(pretrained_models_folder, "GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py")
135
+
136
+
137
+ def _find_higest_points(logits_map, num_top_points=20):
138
+ if num_top_points == 0:
139
+ return logits_map, []
140
+ # find the highest points on the logits map
141
+ gray = cv2.cvtColor(logits_map, cv2.COLOR_BGR2GRAY).astype("uint8")
142
+ # find the highest points
143
+ points = []
144
+ for i in range(num_top_points):
145
+ y, x = np.unravel_index(np.argmax(gray, axis=None), gray.shape)
146
+ points.append((x, y))
147
+ gray[y, x] = 0
148
+ # draw points
149
+ for point in points:
150
+ cv2.drawMarker(logits_map, point, (0, 0, 255), cv2.MARKER_CROSS, 10, 3)
151
+ return logits_map, points
152
+
153
+
154
+ def _find_contour_points(logits_map, num_points=5):
155
+ if num_points == 0:
156
+ return logits_map, []
157
+ # find contours and get number of points on the contour, then draw the points on the image
158
+ gray = cv2.cvtColor(logits_map, cv2.COLOR_BGR2GRAY).astype("uint8")
159
+ ret, thresh = cv2.threshold(gray, 155, 255, 0)
160
+ # erode to make the contour thinner
161
+ kernel = np.ones((13, 13), np.uint8)
162
+ # only apply erode when the image is large enough, otherwise, skip it
163
+ if np.sum(thresh) > (gray.shape[0] * gray.shape[1] * 255 * 0.1):
164
+ erode_iterations = int(np.log2(min(gray.shape[0], gray.shape[1])) - 1)
165
+ thresh = cv2.erode(thresh, kernel, iterations=erode_iterations)
166
+
167
+ # only use the largest contour
168
+ contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
169
+ largest_contour = max(contours, key=cv2.contourArea)
170
+
171
+ points = []
172
+ if len(largest_contour) > num_points:
173
+ for i in range(0, len(largest_contour), len(largest_contour) // num_points):
174
+ if len(points) == num_points:
175
+ break
176
+ x, y = largest_contour[i][0]
177
+ points.append((x, y))
178
+
179
+ # make sure the points are at the same number as num_points
180
+ if len(points) == 0:
181
+ raise ValueError("no points found")
182
+ elif len(points) < num_points:
183
+ for i in range(num_points - len(points)):
184
+ points.append(points[-1])
185
+ elif len(points) > num_points:
186
+ points = points[:num_points]
187
+ else:
188
+ pass
189
+ # draw points
190
+ for point in points:
191
+ # cv2.circle(logits_map, point, 3, (0, 0, 255), -1)
192
+ cv2.drawMarker(logits_map, point, (0, 0, 255), cv2.MARKER_CROSS, 10, 3)
193
+
194
+ return logits_map, points
195
+
196
+
197
+ def _process_logits(logits, pred_phrases, top_n_points):
198
+ # print("logits", logits.shape)
199
+ # torch.Size([3, 1, 468, 500])
200
+ logits = logits.cpu().numpy()[:, 0, :, :]
201
+ logits = ((logits - np.min(logits)) / (np.max(logits) - np.min(logits))) * 255
202
+ logits_maps = []
203
+ points_list = []
204
+ for i, logits_map in enumerate(logits):
205
+ try:
206
+ logits_map = cv2.cvtColor(np.array(logits_map, dtype=np.uint8), cv2.COLOR_GRAY2BGR)
207
+ logits_map, points = _find_higest_points(logits_map, num_top_points=top_n_points)
208
+ if len(points) == 0:
209
+ points = None
210
+ cv2.putText(logits_map, pred_phrases[i], (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
211
+ logits_maps.append(logits_map)
212
+ points_list.append(points)
213
+ except Exception as e:
214
+ print("error in _process_logits", e)
215
+ continue
216
+ return logits_maps, points_list
217
+
218
+
219
+ def run_grounded_sam(
220
+ input_image,
221
+ text_prompt,
222
+ box_threshold,
223
+ text_threshold,
224
+ iou_threshold,
225
+ LABELS = [],
226
+ IDS = [],
227
+ llm = None,
228
+ timer = None,
229
+ # for ablation study
230
+ wo_post = False,
231
+ top_n_points = 20,
232
+ ):
233
+ global groundingdino_model, sam_predictor, already_converted
234
+
235
+ # load image
236
+ image_pil = input_image["image"].convert("RGB")
237
+ transformed_image = transform_image(image_pil).to(device=device)
238
+ size = image_pil.size
239
+
240
+ if groundingdino_model is None:
241
+ groundingdino_model = _load_model(config_file, groundingdino_ckpt, device=device)
242
+
243
+ # run grounding dino model
244
+ boxes_filt, scores, pred_phrases = get_grounding_output(groundingdino_model, transformed_image, text_prompt, box_threshold, text_threshold)
245
+ timer.check("get_grounding_output")
246
+
247
+ # process boxes
248
+ H, W = size[1], size[0]
249
+ for i in range(boxes_filt.size(0)):
250
+ boxes_filt[i] = boxes_filt[i] * torch.Tensor([W, H, W, H])
251
+ boxes_filt[i][:2] -= boxes_filt[i][2:] / 2
252
+ boxes_filt[i][2:] += boxes_filt[i][:2]
253
+ boxes_filt = boxes_filt.cpu()
254
+
255
+ # nms
256
+ nms_idx = torchvision.ops.nms(boxes_filt, scores, iou_threshold).numpy().tolist()
257
+ boxes_filt = boxes_filt[nms_idx]
258
+ pred_phrases = [pred_phrases[idx] for idx in nms_idx]
259
+
260
+ if sam_predictor is None:
261
+ # initialize SAM
262
+ assert sam_ckpt, "sam_ckpt is not found!"
263
+ sam = build_sam_vit_l(checkpoint=sam_ckpt)
264
+ sam.to(device=device).eval()
265
+ sam_predictor = SamPredictor(sam)
266
+ sam_predictor.model.to(device=device)
267
+ image = np.array(image_pil)
268
+ sam_predictor.set_image(image)
269
+
270
+ input_box = torch.tensor(boxes_filt, device=device)
271
+ transformed_boxes = sam_predictor.transform.apply_boxes_torch(input_box, image.shape[:2])
272
+ logits, _, _ = sam_predictor.predict_torch(
273
+ point_coords = None,
274
+ point_labels = None,
275
+ boxes = transformed_boxes,
276
+ multimask_output = False,
277
+ return_logits = True,
278
+ hq_token_only = False,
279
+ )
280
+ timer.check("get prob")
281
+
282
+ output_prob_maps, output_points = _process_logits(logits, pred_phrases, top_n_points=top_n_points)
283
+ if top_n_points == 0:
284
+ # processing without points prompt, for ablation study
285
+ print("processing without points prompt, for ablation study")
286
+ point_coords = None
287
+ point_labels = None
288
+ else:
289
+ if None in output_points:
290
+ point_coords = None
291
+ point_labels = None
292
+ else:
293
+ point_coords = torch.tensor(np.array(output_points), device=device)
294
+ point_coords = sam_predictor.transform.apply_coords_torch(point_coords, image.shape[:2])
295
+ point_labels = torch.ones(point_coords.shape[:2], device=device)
296
+ # print("point_coords", point_coords.shape, point_labels.shape, transformed_boxes.shape)
297
+ transformed_boxes = transformed_boxes[: point_coords.shape[0]]
298
+
299
+ masks, _, _ = sam_predictor.predict_torch(
300
+ point_coords = point_coords,
301
+ point_labels = point_labels,
302
+ boxes = transformed_boxes,
303
+ multimask_output = False,
304
+ hq_token_only = False,
305
+ )
306
+ masks = postprocess_masks(masks, pred_phrases)
307
+ timer.check("postprocess_masks")
308
+
309
+ label_image = Image.new("L", size, color=0)
310
+ label_draw = np.array(label_image)
311
+ output_labels = []
312
+ for mask, pred_phrase in zip(masks, pred_phrases):
313
+ try:
314
+ label = pred_phrase.split("(")[0]
315
+ if label in ["", " "]:
316
+ # skip empty label
317
+ continue
318
+ elif label in LABELS:
319
+ # no need to convert if it's one of the target labels
320
+ post_label = label
321
+ elif label in already_converted:
322
+ # check if the label was converted before to save time and model calls
323
+ post_label = already_converted[label]
324
+ print("already converted: {} to {}".format(label, already_converted[label]))
325
+ else:
326
+ # convert the label using llm model
327
+ label = label.replace(" ", "") if "-" in label else label
328
+ if wo_post:
329
+ print("wo_post is True, for ablation study")
330
+ # skip post refinement, for ablation study
331
+ post_label = label
332
+ else:
333
+ post_label = post_refinement(LABELS, label, llm=llm)
334
+ print("convert from {} to {}".format(label, post_label))
335
+ # cache the conversion result (whether or not the label changed) to avoid repeated LLM calls and save cost
336
+ already_converted.update({label: post_label})
337
+ if post_label not in LABELS:
338
+ raise ValueError("label not found, {} from {}".format(post_label, label))
339
+ output_labels.append(post_label)
340
+ label_index = LABELS.index(post_label)
341
+ label_draw[mask] = IDS[label_index]
342
+ except ValueError as e:
343
+ print("e", e)
344
+ print("label not found: ", pred_phrase)
345
+ traceback.print_exc()
346
+ continue
347
+ timer.check("llm+draw label")
348
+ return label_draw, boxes_filt, output_labels, output_prob_maps, output_points
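
The overlap handling in postprocess_masks can be read off a toy case: when two predicted masks carry different labels and intersect, the intersection is removed from whichever mask has the lower detection score (the number in parentheses of its pred_phrase), so the higher-confidence label keeps the disputed pixels. A small self-contained sketch of that rule, with made-up scores and 4x4 masks:

import numpy as np

mask_a = np.zeros((4, 4), bool); mask_a[:, :3] = True    # e.g. "car(0.62)"
mask_b = np.zeros((4, 4), bool); mask_b[:, 2:] = True    # e.g. "truck(0.48)", overlaps column 2
score_a, score_b = 0.62, 0.48

if score_b < score_a:
    mask_b = np.logical_and(mask_b, np.logical_not(mask_a))   # lower-scoring mask loses the overlap
else:
    mask_a = np.logical_and(mask_a, np.logical_not(mask_b))

print(mask_a[0], mask_b[0])   # -> [ True  True  True False] [False False False  True]
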
utils/labels_utils.py ADDED
@@ -0,0 +1,214 @@
1
+ import cv2
2
+ import numpy as np
3
+
4
+ COCO_CATEGORIES = [
5
+ # borrowed from detectron2
6
+ # https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/data/datasets/register_coco_stuff_10k.py
7
+ {"color": [0, 0, 0], "isthing": 0, "id": 0, "name": "unlabeled"},
8
+ {"color": [220, 20, 60], "isthing": 1, "id": 1, "name": "person"},
9
+ {"color": [119, 11, 32], "isthing": 1, "id": 2, "name": "bicycle"},
10
+ {"color": [0, 0, 142], "isthing": 1, "id": 3, "name": "car"},
11
+ {"color": [0, 0, 230], "isthing": 1, "id": 4, "name": "motorcycle"},
12
+ {"color": [106, 0, 228], "isthing": 1, "id": 5, "name": "airplane"},
13
+ {"color": [0, 60, 100], "isthing": 1, "id": 6, "name": "bus"},
14
+ {"color": [0, 80, 100], "isthing": 1, "id": 7, "name": "train"},
15
+ {"color": [0, 0, 70], "isthing": 1, "id": 8, "name": "truck"},
16
+ {"color": [0, 0, 192], "isthing": 1, "id": 9, "name": "boat"},
17
+ {"color": [250, 170, 30], "isthing": 1, "id": 10, "name": "traffic light"},
18
+ {"color": [100, 170, 30], "isthing": 1, "id": 11, "name": "fire hydrant"},
19
+ {"color": [220, 220, 0], "isthing": 1, "id": 13, "name": "stop sign"},
20
+ {"color": [175, 116, 175], "isthing": 1, "id": 14, "name": "parking meter"},
21
+ {"color": [250, 0, 30], "isthing": 1, "id": 15, "name": "bench"},
22
+ {"color": [165, 42, 42], "isthing": 1, "id": 16, "name": "bird"},
23
+ {"color": [255, 77, 255], "isthing": 1, "id": 17, "name": "cat"},
24
+ {"color": [0, 226, 252], "isthing": 1, "id": 18, "name": "dog"},
25
+ {"color": [182, 182, 255], "isthing": 1, "id": 19, "name": "horse"},
26
+ {"color": [0, 82, 0], "isthing": 1, "id": 20, "name": "sheep"},
27
+ {"color": [120, 166, 157], "isthing": 1, "id": 21, "name": "cow"},
28
+ {"color": [110, 76, 0], "isthing": 1, "id": 22, "name": "elephant"},
29
+ {"color": [174, 57, 255], "isthing": 1, "id": 23, "name": "bear"},
30
+ {"color": [199, 100, 0], "isthing": 1, "id": 24, "name": "zebra"},
31
+ {"color": [72, 0, 118], "isthing": 1, "id": 25, "name": "giraffe"},
32
+ {"color": [255, 179, 240], "isthing": 1, "id": 27, "name": "backpack"},
33
+ {"color": [0, 125, 92], "isthing": 1, "id": 28, "name": "umbrella"},
34
+ {"color": [209, 0, 151], "isthing": 1, "id": 31, "name": "handbag"},
35
+ {"color": [188, 208, 182], "isthing": 1, "id": 32, "name": "tie"},
36
+ {"color": [0, 220, 176], "isthing": 1, "id": 33, "name": "suitcase"},
37
+ {"color": [255, 99, 164], "isthing": 1, "id": 34, "name": "frisbee"},
38
+ {"color": [92, 0, 73], "isthing": 1, "id": 35, "name": "skis"},
39
+ {"color": [133, 129, 255], "isthing": 1, "id": 36, "name": "snowboard"},
40
+ {"color": [78, 180, 255], "isthing": 1, "id": 37, "name": "sports ball"},
41
+ {"color": [0, 228, 0], "isthing": 1, "id": 38, "name": "kite"},
42
+ {"color": [174, 255, 243], "isthing": 1, "id": 39, "name": "baseball bat"},
43
+ {"color": [45, 89, 255], "isthing": 1, "id": 40, "name": "baseball glove"},
44
+ {"color": [134, 134, 103], "isthing": 1, "id": 41, "name": "skateboard"},
45
+ {"color": [145, 148, 174], "isthing": 1, "id": 42, "name": "surfboard"},
46
+ {"color": [255, 208, 186], "isthing": 1, "id": 43, "name": "tennis racket"},
47
+ {"color": [197, 226, 255], "isthing": 1, "id": 44, "name": "bottle"},
48
+ {"color": [171, 134, 1], "isthing": 1, "id": 46, "name": "wine glass"},
49
+ {"color": [109, 63, 54], "isthing": 1, "id": 47, "name": "cup"},
50
+ {"color": [207, 138, 255], "isthing": 1, "id": 48, "name": "fork"},
51
+ {"color": [151, 0, 95], "isthing": 1, "id": 49, "name": "knife"},
52
+ {"color": [9, 80, 61], "isthing": 1, "id": 50, "name": "spoon"},
53
+ {"color": [84, 105, 51], "isthing": 1, "id": 51, "name": "bowl"},
54
+ {"color": [74, 65, 105], "isthing": 1, "id": 52, "name": "banana"},
55
+ {"color": [166, 196, 102], "isthing": 1, "id": 53, "name": "apple"},
56
+ {"color": [208, 195, 210], "isthing": 1, "id": 54, "name": "sandwich"},
57
+ {"color": [255, 109, 65], "isthing": 1, "id": 55, "name": "orange"},
58
+ {"color": [0, 143, 149], "isthing": 1, "id": 56, "name": "broccoli"},
59
+ {"color": [179, 0, 194], "isthing": 1, "id": 57, "name": "carrot"},
60
+ {"color": [209, 99, 106], "isthing": 1, "id": 58, "name": "hot dog"},
61
+ {"color": [5, 121, 0], "isthing": 1, "id": 59, "name": "pizza"},
62
+ {"color": [227, 255, 205], "isthing": 1, "id": 60, "name": "donut"},
63
+ {"color": [147, 186, 208], "isthing": 1, "id": 61, "name": "cake"},
64
+ {"color": [153, 69, 1], "isthing": 1, "id": 62, "name": "chair"},
65
+ {"color": [3, 95, 161], "isthing": 1, "id": 63, "name": "couch"},
66
+ {"color": [163, 255, 0], "isthing": 1, "id": 64, "name": "potted plant"},
67
+ {"color": [119, 0, 170], "isthing": 1, "id": 65, "name": "bed"},
68
+ {"color": [0, 182, 199], "isthing": 1, "id": 67, "name": "dining table"},
69
+ {"color": [0, 165, 120], "isthing": 1, "id": 70, "name": "toilet"},
70
+ {"color": [183, 130, 88], "isthing": 1, "id": 72, "name": "tv"},
71
+ {"color": [95, 32, 0], "isthing": 1, "id": 73, "name": "laptop"},
72
+ {"color": [130, 114, 135], "isthing": 1, "id": 74, "name": "mouse"},
73
+ {"color": [110, 129, 133], "isthing": 1, "id": 75, "name": "remote"},
74
+ {"color": [166, 74, 118], "isthing": 1, "id": 76, "name": "keyboard"},
75
+ {"color": [219, 142, 185], "isthing": 1, "id": 77, "name": "cell phone"},
76
+ {"color": [79, 210, 114], "isthing": 1, "id": 78, "name": "microwave"},
77
+ {"color": [178, 90, 62], "isthing": 1, "id": 79, "name": "oven"},
78
+ {"color": [65, 70, 15], "isthing": 1, "id": 80, "name": "toaster"},
79
+ {"color": [127, 167, 115], "isthing": 1, "id": 81, "name": "sink"},
80
+ {"color": [59, 105, 106], "isthing": 1, "id": 82, "name": "refrigerator"},
81
+ {"color": [142, 108, 45], "isthing": 1, "id": 84, "name": "book"},
82
+ {"color": [196, 172, 0], "isthing": 1, "id": 85, "name": "clock"},
83
+ {"color": [95, 54, 80], "isthing": 1, "id": 86, "name": "vase"},
84
+ {"color": [128, 76, 255], "isthing": 1, "id": 87, "name": "scissors"},
85
+ {"color": [201, 57, 1], "isthing": 1, "id": 88, "name": "teddy bear"},
86
+ {"color": [246, 0, 122], "isthing": 1, "id": 89, "name": "hair drier"},
87
+ {"color": [191, 162, 208], "isthing": 1, "id": 90, "name": "toothbrush"},
88
+ ]
89
+
90
+
91
+ def create_coco_colormap(IDs):
92
+ all_colors = []
93
+ vis_colors = [category["color"] for category in COCO_CATEGORIES]
94
+ used_ids = [category["id"] for category in COCO_CATEGORIES]
95
+ all_colors = [vis_colors[used_ids.index(id)] if id in used_ids else [0, 0, 0] for id in range(max(IDs)+1)]
96
+ return np.array(all_colors, dtype=int)
97
+
98
+
99
+ def create_cityscapes_colormap(IDs):
100
+ vis_colors = [
101
+ (0, 0, 0),
102
+ (128, 64, 128),
103
+ (244, 35, 232),
104
+ (70, 70, 70),
105
+ (102, 102, 156),
106
+ (190, 153, 153),
107
+ (153, 153, 153),
108
+ (250, 170, 30),
109
+ (220, 220, 0),
110
+ (107, 142, 35),
111
+ (152, 251, 152),
112
+ (70, 130, 180),
113
+ (220, 20, 60),
114
+ (255, 0, 0),
115
+ (0, 0, 142),
116
+ (0, 0, 70),
117
+ (0, 60, 100),
118
+ (0, 80, 100),
119
+ (0, 0, 230),
120
+ (119, 11, 32),
121
+ ]
122
+
123
+ all_colors = [vis_colors[IDs.index(id)] if id in IDs else [0, 0, 0] for id in range(max(IDs)+1)]
124
+ return np.array(all_colors, dtype=int)
125
+
126
+ def create_pascal_label_colormap(n_labels=256):
127
+ def bitget(byteval, idx):
128
+ return ((byteval & (1 << idx)) != 0)
129
+
130
+ cmap = np.zeros((n_labels, 3), dtype=np.uint8)
131
+ for i in range(n_labels):
132
+ r = g = b = 0
133
+ c = i
134
+ for j in range(8):
135
+ r = r | (bitget(c, 0) << 7-j)
136
+ g = g | (bitget(c, 1) << 7-j)
137
+ b = b | (bitget(c, 2) << 7-j)
138
+ c = c >> 3
139
+ cmap[i] = np.array([r, g, b])
140
+ return cmap
141
+
142
+
143
+
144
+ class Labels:
145
+ def __init__(self, config=None):
146
+ max_label_num = 200
147
+ if config is not None:
148
+ self.LABELS = config.label_list.split(", ")
149
+ self.IDS = config.mask_ids if hasattr(config, "mask_ids") else [i for i in range(len(self.LABELS))]
150
+ print("self.IDS", self.IDS)
151
+ if len(self.LABELS) > max_label_num:
152
+ raise ValueError(f"Too many labels! The maximum number of labels is {max_label_num}.")
153
+ else:
154
+ raise NotImplementedError("config is None")
155
+
156
+ if "COCO" in config.Name:
157
+ self.COLORS = create_coco_colormap(self.IDS)
158
+ elif "City" in config.Name:
159
+ self.COLORS = create_cityscapes_colormap(self.IDS)
160
+ else:
161
+ # default to pascal label colormap
162
+ self.COLORS = create_pascal_label_colormap()
163
+
164
+ assert len(self.COLORS) >= len(self.LABELS), f"len(self.COLORS)={len(self.COLORS)} < len(self.LABELS)={len(self.LABELS)}"
165
+
166
+ def check_labels(self, labels_list):
167
+ output_labels_list = []
168
+ for labels in labels_list:
169
+ output_labels = []
170
+ labels = labels.split(", ")
171
+ for label in labels:
172
+ if label == "background":
173
+ # skip the background label
174
+ continue
175
+ if label in self.LABELS:
176
+ output_labels.append(label)
177
+ output_labels = list(set(output_labels))
178
+ output_labels_list.append(", ".join(output_labels))
179
+ return output_labels_list
180
+
181
+ def draw_mask(self, label_ori, image_ori, print_label=False, tag="", only_label=False):
182
+ label_ori = label_ori.astype(np.uint8)
183
+ label = np.zeros_like(image_ori, dtype=np.uint8)
184
+ # print("{}: {}".format(tag, np.unique(label_ori)))
185
+ for id in np.unique(label_ori):
186
+ # print("id", id)
187
+ if id == 0 or id == 255:
188
+ continue
189
+ elif id not in self.IDS:
190
+ print(f"Label {id} is not in the label list.")
191
+ continue
192
+ i = self.IDS.index(id)
193
+ center = np.mean(np.argwhere(label_ori == id), axis=0).astype(np.int64)
194
+ label[label_ori == id] = self.COLORS[id]
195
+ if print_label:
196
+ # add text in the center of the mask
197
+ cv2.putText(label, self.LABELS[i], (center[1], center[0]), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
198
+ # print(i, self.LABELS[i])
199
+ # RGB to BGR
200
+ label = cv2.cvtColor(label, cv2.COLOR_RGB2BGR)
201
+ return cv2.addWeighted(label, 0.6, image_ori, 0.4, 0) if not only_label else label
202
+
203
+ def find_gt_labels(self, label_gt):
204
+ label_gt = label_gt.astype(np.uint8)
205
+ label_gt_list = []
206
+ for id in np.unique(label_gt):
207
+ if id == 0 or id == 255:
208
+ continue
209
+ elif id not in self.IDS:
210
+ print(f"Label {id} is not in the label list.")
211
+ continue
212
+ i = self.IDS.index(id)
213
+ label_gt_list.append(self.LABELS[i])
214
+ return label_gt_list
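
For datasets other than COCO and Cityscapes, Labels falls back to create_pascal_label_colormap, which rebuilds the standard PASCAL VOC palette by spreading bits 0/1/2 of the label index across the high bits of R/G/B and then shifting the index right by three. A quick self-contained check of the first few entries, with the same bit logic rewritten inline:

def pascal_color(i):
    r = g = b = 0
    c = i
    for j in range(8):
        r |= ((c >> 0) & 1) << (7 - j)
        g |= ((c >> 1) & 1) << (7 - j)
        b |= ((c >> 2) & 1) << (7 - j)
        c >>= 3
    return (r, g, b)

print([pascal_color(i) for i in range(4)])
# -> [(0, 0, 0), (128, 0, 0), (0, 128, 0), (128, 128, 0)]
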
utils/llms_utils.py ADDED
@@ -0,0 +1,233 @@
1
+
2
+ import os
3
+ import torch
4
+ from openai import OpenAI
5
+ from termcolor import colored
6
+
7
+ import transformers
8
+ # from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer
9
+ from huggingface_hub import login
10
+
11
+ # environment variables and paths
12
+ from .env_utils import get_device, low_vram_mode
13
+
14
+ device = get_device()
15
+
16
+ class GPT:
17
+ def __init__(self, model="gpt-4o-mini", api_key=None):
18
+ self.prices = {
19
+ # check at https://openai.com/api/pricing/
20
+ "gpt-3.5-turbo-0125": [0.0000005, 0.0000015],
21
+ "gpt-4o-mini" : [0.00000015, 0.00000060],
22
+ "gpt-4-1106-preview": [0.00001, 0.00003],
23
+ "gpt-4-0125-preview": [0.00001, 0.00003],
24
+ "gpt-4-turbo" : [0.00001, 0.00003],
25
+ "gpt-4o" : [0.000005, 0.000015],
26
+ }
27
+ self.cheaper_model = "gpt-4o-mini"
28
+ assert model in self.prices.keys(), "Invalid model, please choose from: {}, or add new models in the code.".format(self.prices.keys())
29
+ self.model = model
30
+ print(f"Using {model}")
31
+ self.client = OpenAI(api_key=api_key)
32
+ self.total_cost = 0.0
33
+
34
+ def _update(self, response, price):
35
+ current_cost = response.usage.prompt_tokens * price[0] + response.usage.completion_tokens * price[1]  # price = [input, output] per token
36
+ self.total_cost += current_cost
37
+ # print in 4 decimal places
38
+ print(
39
+ colored(
40
+ f"Current Tokens: {response.usage.completion_tokens + response.usage.prompt_tokens:d} \
41
+ Current cost: {current_cost:.4f} $, \
42
+ Total cost: {self.total_cost:.4f} $",
43
+ "yellow",
44
+ )
45
+ )
46
+
47
+ def chat(self, messages, temperature=0.0, max_tokens=200, post=False):
48
+ # set temperature to 0.0 for more deterministic results
49
+ if post:
50
+ # use cheaper model for post-refinement to save costs, since the task is simpler.
51
+ generated_text = self.client.chat.completions.create(
52
+ model=self.cheaper_model, messages=messages, temperature=temperature, max_tokens=max_tokens
53
+ )
54
+ self._update(generated_text, self.prices[self.cheaper_model])
55
+ else:
56
+ generated_text = self.client.chat.completions.create(
57
+ model=self.model, messages=messages, temperature=temperature, max_tokens=max_tokens
58
+ )
59
+ self._update(generated_text, self.prices[self.model])
60
+ generated_text = generated_text.choices[0].message.content
61
+ return generated_text
62
+
63
+
64
+ class Llama3:
65
+ def __init__(self, model="Meta-Llama-3-8B-Instruct"):
66
+ login(token=os.getenv('HF_TOKEN'))
67
+ model = "meta-llama/{}".format(model) # or replace with your local model path
68
+ print(f"Using {model}")
69
+ # ZeroGPU does not support quantization.
70
+ # tokenizer = AutoTokenizer.from_pretrained(model)
71
+ # if low_vram_mode:
72
+ # model = AutoModelForCausalLM.from_pretrained(
73
+ # model, quantization_config=BitsAndBytesConfig(load_in_8bit=True), device_map="auto"
74
+ # ).eval()
75
+ self.pipeline = transformers.pipeline(
76
+ "text-generation",
77
+ model = model,
78
+ # tokenizer = tokenizer,
79
+ model_kwargs = {"torch_dtype": torch.bfloat16},
80
+ device_map = "auto",
81
+ )
82
+ self.terminators = [self.pipeline.tokenizer.eos_token_id, self.pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")]
83
+
84
+ def _update(self):
85
+ print(colored("Using Llama-3, Free", "green"))
86
+
87
+ def chat(self, messages, temperature=0.0, max_tokens=200, post=False):
88
+ prompt = self.pipeline.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
89
+ generated_text = self.pipeline(
90
+ prompt,
91
+ max_new_tokens = max_tokens,
92
+ eos_token_id = self.terminators,
93
+ pad_token_id = 128001,
94
+ do_sample = True,
95
+ temperature = max(temperature, 0.01), # 0.0 is not supported
96
+ top_p = 0.9,
97
+ )
98
+ self._update()
99
+ generated_text = generated_text[0]["generated_text"][len(prompt) :]
100
+ return generated_text
101
+
102
+
103
+ # Define the timeout handler
104
+ def timeout_handler(signum, frame):
105
+ raise TimeoutError()
106
+
107
+
108
+ def init_model(model, api_key=None):
109
+ if "gpt" in model:
110
+ return GPT(model=model, api_key=api_key)
111
+ elif "Llama" in model:
112
+ return Llama3(model=model)
113
+ else:
114
+ raise ValueError("Invalid model")
115
+
116
+
117
+ def _generate_example_prompt(examples, llm=None):
118
+ # system prompt
119
+ system_prompt = """
120
+ Task Description:
121
+ - you will provide detailed explanations for example inputs and outputs within the context of the task.
122
+
123
+ Please adhere to the following rules:
124
+ - Exclude terms that appear in both lists.
125
+ - Detail the relevance of unmatched terms from input to output, focusing on indirect relationships.
126
+ - Identify and explain terms common to all output lists but rarely present in input lists; include these at the end of the output labeled 'Recommend Include Labels'.
127
+ - Each explanation should be concise, around 50 words.
128
+
129
+ Output Format:
130
+ - '1. Input... Output... Explanation... n. Input... Output... Explanation... \n Recommend Include Labels: label1, labeln, ...'
131
+ """
132
+ messages = [
133
+ {"role": "system", "content": system_prompt},
134
+ {
135
+ "role": "user",
136
+ "content": f"Here are the input and output lists for which you need to provide detailed explanations:{examples.strip()}",
137
+ },
138
+ ]
139
+ generated_example = llm.chat(messages, temperature=0.0, max_tokens=1000)
140
+ return generated_example
141
+
142
+
143
+ def _make_prompt(label_list, example=None):
+     Cityscape = "sidewalk" in label_list
+     if Cityscape:
+         add_text = f'contain at least {len(label_list.split(", "))} labels, '
+     else:
+         add_text = ""
+     # Task description and instructions for processing the input to generate output
+     system_prompt = f"""
+ Task Description:
+ - You will receive a list of caption tags accompanied by a caption text and must assign appropriate labels from a predefined label list: "{label_list}".
+
+ Instructions:
+ Step 1. Visualize the scene suggested by the input caption tags and text.
+ Step 2. Analyze each term within the overall scene to predict relevant labels from the predefined list, ensuring no term is overlooked.
+ Step 3. Now forget the input list and focus on the scene as a whole, expanding upon the labels to include any contextually relevant labels that complete the scene or setting.
+ Step 4. Compile all identified labels into a comma-separated list, adhering strictly to the specified format.
+
+ Contextually Relevant Tips:
+ - Equivalencies include converting "girl, man" to "person" and "flower, vase" to "potted plant", while "bicycle, motorcycle" suggest "rider".
+ - An outdoor scene may include labels like "sky", "tree", "clouds", "terrain".
+ - An urban scene may imply "bus", "bicycle", "road", "sidewalk", "building", "pole", "traffic-light", "traffic-sign".
+
+ Output:
+ - Do not output any explanations other than the final label list.
+ - The final output should {add_text}strictly adhere to the specified format: label1, label2, ... labeln
+     """.strip()
+     if example:
+         system_prompt += f"""
+ Additional Examples with Detailed Explanations:
+ {example}
+     """
+     print("system_prompt: ", system_prompt)
+     return system_prompt
+
+
+ # - You will receive a list of terms accompanied by a caption text and must assign appropriate labels from a predefined label list: "{label_list}".
+
+ # Instructions:
+ # Step 1. Visualize the scene suggested by the input list and caption text.
+
+
+ def make_prompt(label_list):
+     # Create a new system prompt using the label list and the improved example prompt
+     system_prompt = _make_prompt(label_list)
+     system_prompt = {"role": "system", "content": system_prompt.strip()}
+     print("system_prompt: ", system_prompt)
+     return system_prompt
+
+
+ def _call_llm(system_prompt, llm, user_input):
+     messages = [system_prompt, {"role": "user", "content": "Here are input caption tags and text: " + user_input}]
+     converted_label = llm.chat(messages=messages, temperature=0.0, max_tokens=200)
+     return converted_label
+
+
+ def pre_refinement(user_input_list, system_prompt, llm=None):
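+     # Queries the LLM once per caption-tag string, then prepends the original tags to each
+     # answer so later stages see both the raw tags and the predicted labels.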
+     llm_outputs = [_call_llm(system_prompt, llm, user_input) for user_input in user_input_list]
+     converted_labels = [f"{user_input_}, {converted_label}" for user_input_, converted_label in zip(user_input_list, llm_outputs)]
+     return converted_labels, llm_outputs
+
+
+ def post_refinement(label_list, detected_label, llm=None):
+     system_input = f"""
+ Task Description:
+ - You will receive a specific phrase and must assign an appropriate label from the predefined label list: "{label_list}". \n \
+
+ Please adhere to the following rules: \n \
+ - Select and return only one relevant label from the predefined label list that corresponds to the given phrase. \n \
+ - Do not include any additional information or context beyond the label itself. \n \
+ - Format is purely the label itself, without any additional punctuation or formatting. \n \
+     """
+     system_input = {"role": "system", "content": system_input}
+     messages = [system_input, {"role": "user", "content": detected_label}]
+     if detected_label == "":
+         return ""
+     generated_label = None
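+     # Retry up to three times, raising the temperature slightly on each retry to
+     # encourage a non-empty answer from the LLM.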
+     for count in range(3):
+         generated_label = llm.chat(messages=messages, temperature=0.0 if count == 0 else 0.1 * count, post=True)
+         if generated_label != "":
+             break
+     return generated_label
+
+
+ if __name__ == "__main__":
+     # test the functions
+     llm = Llama3(model="Meta-Llama-3-8B-Instruct")
+
+     system_prompt = make_prompt("person, car, tree, sky, road, building, sidewalk, traffic-light, traffic-sign")
+
+     converted_labels, llm_outputs = pre_refinement(["person, car, road, traffic-light"], system_prompt, llm=llm)
+     print("converted_labels: ", converted_labels)
+     print("llm_outputs: ", llm_outputs)
utils/ram_utils.py ADDED
@@ -0,0 +1,86 @@
+ import os
+ import torch
+ from PIL import Image
+ from .env_utils import get_device, low_vram_mode
+
+ device = get_device()
+
+ pretrained_models_folder = os.path.join(os.path.dirname(__file__), "../pretrained-models")
+
+
+ # RAM++
+ from ram.models import ram_plus
+ from ram import get_transform, inference_ram
+
+ ram_ckpt = os.path.join(pretrained_models_folder, "checkpoints/ram_plus_swin_large_14m.pth")
+ ram_precision = torch.bfloat16
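+ # Note: bfloat16 inference roughly halves memory versus float32; devices without
+ # bfloat16 support may need torch.float32 here instead.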
+
+
+ def ram_init():
+     image_size = 384
+     transform = get_transform(image_size=image_size)
+     #######load model#######
+     model = ram_plus(pretrained=ram_ckpt, image_size=image_size, vit="swin_l")
+     model = model.to(device=device, dtype=ram_precision)
+     model.eval()
+     print("RAM++ model loaded")
+     return model, transform
+
+
+ # Initialize the model when importing the module
+ ram_model, ram_transform = ram_init()
+
+
+ def _inference(image_pil):
+     image = ram_transform(image_pil).unsqueeze(0)
+     image = image.to(device=device, dtype=ram_precision)
+     res = inference_ram(image, ram_model)
+     result = res[0].replace(" | ", ", ")
+     return result
+
+
+ def _split_large_image(image_pil):
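+     # Crops the image into a 2x2 grid of equally sized patches; for odd dimensions the
+     # rightmost column / bottom row of pixels is dropped by the integer division below.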
+     size = image_pil.size
+     print("Image size is too large, split into smaller patches")
+     # Split the image into 4 patches
+     patches = []
+     patch_size = (size[0] // 2, size[1] // 2)
+     for i in range(2):
+         for j in range(2):
+             left = i * patch_size[0]
+             top = j * patch_size[1]
+             right = left + patch_size[0]
+             bottom = top + patch_size[1]
+             patch = image_pil.crop((left, top, right, bottom))
+             patches.append(patch)
+     return patches
+
+
+ def ram_inference(image_pil: Image.Image):
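+     # Images larger than 640 px on either side are tagged patch by patch, and the per-patch
+     # tags are deduplicated with set(), so the order of the returned tags is not deterministic.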
+     size = image_pil.size
+     if size[0] > 640 or size[1] > 640:
+         # split only once in the online demo version.
+         patches = _split_large_image(image_pil)
+         # while any(patch.size[0] > 640 or patch.size[1] > 640 for patch in patches):
+         #     patches = [_split_large_image(patch) for patch in patches]
+         #     patches = [patch for sublist in patches for patch in sublist]
+         # Inference on each patch
+         results = []
+         for patch in patches:
+             result = _inference(patch)
+             results.extend(result.split(", "))
+         results = list(set(results))
+         # Combine the results
+         final_result = ", ".join(results)
+         return final_result
+     else:
+         print("Image size is small enough for inference")
+         return _inference(image_pil)
+
+
+ if __name__ == "__main__":
+     # Test the RAM++ model
+     image_path = os.path.join(os.path.dirname(__file__), "../sources/test_imgs/1.jpg")
+     image = Image.open(image_path)
+     result = ram_inference(image)
+     print(result)
utils/timer_utils.py ADDED
@@ -0,0 +1,89 @@
+ import os
+ import time
+ import logging
+
+
+ def create_logger(logger_name: str, log_file_path: os.PathLike = None):
+     """
+     Create a logger with the specified name and log file path.
+     """
+     logger = logging.getLogger(logger_name)
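+     # propagate=False keeps these entries out of the root logger, so timing logs go only
+     # to the dedicated file handler added below.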
+     logger.propagate = False
+     logger.setLevel(logging.DEBUG)
+     assert log_file_path is not None, "log_file_path is required"
+     fh = logging.FileHandler(log_file_path)
+     fh_formatter = logging.Formatter("%(asctime)s : %(levelname)s, %(funcName)s Message: %(message)s")
+     fh.setFormatter(fh_formatter)
+     logger.addHandler(fh)
+     logger.info(f"logging start: {logger_name}")
+     return logger
+
+
+ class Timer:
+     """
+     A simple timer class for measuring elapsed time.
+     """
+
+     def __init__(self, filename: os.PathLike = "timer_log.log", reset: bool = False):
+         """
+         Initialize the Timer object.
+         """
+         self.start_time = None
+         self.last_checkpoint = None
+         self.filename = filename
+         self.logger = create_logger("Timer", filename)
+         if reset:
+             self._reset_log_file()
+
+     def _reset_log_file(self):
+         """
+         Reset the log file by clearing its contents.
+         """
+         with open(self.filename, "w") as file:
+             file.write("")
+
+     def start(self):
+         """
+         Start the timer.
+         """
+         self.start_time = time.time()
+         self.last_checkpoint = self.start_time
+         self.logger.info("Timer started.")
+
+     def check(self, message):
+         """
+         Log a checkpoint with the current time and time since the last checkpoint.
+
+         Args:
+             message (str): The message to include in the log.
+         """
+         if self.start_time is None:
+             self.logger.warning("Timer has not been started.")
+         else:
+             log_message = (
+                 f"Current time count: {time.time() - self.start_time:.4f} seconds, "
+                 f"Time since last checkpoint: {time.time() - self.last_checkpoint:.4f} seconds, "
+                 f"for {message}"
+             )
+             self.last_checkpoint = time.time()
+             self.logger.info(log_message)
+
+     def stop(self):
+         """
+         Stop the timer and log the elapsed time.
+         """
+         if self.start_time is None:
+             self.logger.warning("Timer has not been started.")
+         else:
+             self.end_time = time.time()
+             self.logger.info(f"Total elapsed time: {self.end_time - self.start_time} seconds\n")
+
+
+ if __name__ == "__main__":
+     # Test the Timer class
+     timer = Timer(filename="timer_log.log", reset=True)
+     timer.start()
+     timer.check("First checkpoint")
+     time.sleep(1)
+     timer.check("Second checkpoint")
+     timer.stop()