Spaces:
Runtime error
Runtime error
File size: 5,496 Bytes
372785b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 |
# Adapted from https://github.com/magic-research/Sa2VA/blob/main/projects/llava_sam2/models/sam2.py.
# Below is the original copyright:
# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os.path
import torch
import torch.nn as nn
from hydra import compose
from hydra.utils import instantiate
from omegaconf import OmegaConf
from .utils import load_checkpoint_with_prefix, load_state_dict_to_model
class SAM2(nn.Module):
def __init__(
self,
cfg_path: str = "sam2_hiera_l.yaml",
ckpt_path: str = "sam2_hiera_large.pt",
hydra_overrides_extra=None,
apply_postprocessing=True,
):
super().__init__()
import third_parts.sam2 # noqa: F401
if hydra_overrides_extra is None:
hydra_overrides_extra = []
hydra_overrides = [
## Extension: LLM prompt
"++model._target_=rynnec.model.predictor.SAM2VideoPredictor",
]
if apply_postprocessing:
hydra_overrides_extra = hydra_overrides_extra.copy()
hydra_overrides_extra += [
# dynamically fall back to multi-mask if the single mask is not stable
"++model.sam_mask_decoder_extra_args.dynamic_multimask_via_stability=true",
"++model.sam_mask_decoder_extra_args.dynamic_multimask_stability_delta=0.05",
"++model.sam_mask_decoder_extra_args.dynamic_multimask_stability_thresh=0.98",
# the sigmoid mask logits on interacted frames with clicks in the memory encoder so that the encoded masks are exactly as what users see from clicking
# "++model.binarize_mask_from_pts_for_mem_enc=true",
# fill small holes in the low-res masks up to `fill_hole_area` (before resizing them to the original video resolution)
# "++model.fill_hole_area=8",
]
hydra_overrides.extend(hydra_overrides_extra)
# Read config and init model
cfg = compose(config_name=cfg_path, overrides=hydra_overrides)
OmegaConf.resolve(cfg)
sam2_model = instantiate(cfg.model, _recursive_=True)
state_dict = load_checkpoint_with_prefix(ckpt_path)
load_state_dict_to_model(sam2_model, state_dict)
self.sam2_model = sam2_model
self.hidden_dim = self.sam2_model.hidden_dim
self.img_mean = (0.485, 0.456, 0.406)
self.img_std = (0.229, 0.224, 0.225)
def inject_language_embd(self, inference_state, language_embd):
num_frame = len(language_embd)
num_obj = len(language_embd[0])
mask_out = []
for frame_idx in range(num_frame):
frame_mask_out = []
for obj_idx in range(num_obj):
_language_embd = language_embd[frame_idx][obj_idx][None][None]
_, _, out_mask_logits = self.sam2_model.add_language_embd(inference_state, frame_idx, obj_idx + 100, _language_embd)
frame_mask_out.append(out_mask_logits)
frame_mask_out = torch.cat(frame_mask_out, dim=1)
mask_out.append(frame_mask_out)
mask_out = torch.cat(mask_out, dim=0)
return mask_out
def language_embd_inference(self, inference_state, language_embd):
num_frame = len(language_embd)
num_obj = len(language_embd[0])
mask_out = []
with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
for frame_idx in range(num_frame):
frame_mask_out = []
for obj_idx in range(num_obj):
_language_embd = language_embd[frame_idx][obj_idx][None][None]
_, _, out_mask_logits = self.sam2_model.add_language_embd(
inference_state,
frame_idx,
obj_idx + 100,
_language_embd,
inference=True,
)
frame_mask_out.append(out_mask_logits)
frame_mask_out = torch.cat(frame_mask_out, dim=1)
mask_out.append(frame_mask_out)
mask_out = []
for out_frame_idx, out_obj_ids, out_mask_logits in self.sam2_model.propagate_in_video(inference_state):
mask_out.append(out_mask_logits)
mask_out = torch.cat(mask_out, dim=0)
return mask_out
def get_sam2_embeddings(self, images):
return self.sam2_model.init_state(images)
def forward(self, batch):
raise NotImplementedError
def preprocess_image(self, image: torch.Tensor, dtype=torch.float32) -> torch.Tensor:
image = image / 255.
img_mean = torch.tensor(self.img_mean, dtype=dtype, device=image.device)[:, None, None]
img_std = torch.tensor(self.img_std, dtype=dtype, device=image.device)[:, None, None]
image -= img_mean
image /= img_std
return image
|