# --------------------------------------------------------
# SEEM -- Segment Everything Everywhere All At Once
# Copyright (c) 2022 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Xueyan Zou ([email protected]), Jianwei Yang ([email protected])
# --------------------------------------------------------
import os
import warnings
import PIL
from PIL import Image
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple
import gradio as gr
import torch
import argparse
import whisper
import numpy as np
from gradio import processing_utils
from modeling.BaseModel import BaseModel
from modeling import build_model
from utils.distributed import init_distributed
from utils.arguments import load_opt_from_config_files
from utils.constants import COCO_PANOPTIC_CLASSES
from demo.seem.tasks import *
def parse_option():
    parser = argparse.ArgumentParser('SEEM Demo', add_help=False)
    parser.add_argument('--conf_files', default="configs/seem/focall_unicl_lang_demo.yaml", metavar="FILE", help='path to config file')
    cfg = parser.parse_args()
    return cfg

'''
build args
'''
cfg = parse_option()
opt = load_opt_from_config_files([cfg.conf_files])
opt = init_distributed(opt)
# META DATA
cur_model = 'None'
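# Pick the checkpoint that matches the chosen config and fetch it from
# Hugging Face if it is not already cached locally.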
if 'focalt' in cfg.conf_files:
    pretrained_pth = os.path.join("seem_focalt_v0.pt")
    if not os.path.exists(pretrained_pth):
        os.system("wget {}".format("https://huggingface.co/xdecoder/SEEM/resolve/main/seem_focalt_v0.pt"))
    cur_model = 'Focal-T'
elif 'focal' in cfg.conf_files:
    pretrained_pth = os.path.join("seem_focall_v0.pt")
    if not os.path.exists(pretrained_pth):
        os.system("wget {}".format("https://huggingface.co/xdecoder/SEEM/resolve/main/seem_focall_v0.pt"))
    cur_model = 'Focal-L'
'''
build model
'''
model = BaseModel(opt, build_model(opt)).from_pretrained(pretrained_pth).eval().cuda()
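# Pre-compute text embeddings for the COCO panoptic vocabulary (plus a
# "background" class) so the language encoder is ready for open-vocabulary queries.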
with torch.no_grad():
    model.model.sem_seg_head.predictor.lang_encoder.get_text_embeddings(COCO_PANOPTIC_CLASSES + ["background"], is_eval=True)
'''
audio
'''
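# Whisper (base) transcribes spoken referring prompts into text.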
audio = whisper.load_model("base")
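
# Single entry point for all demo tasks: video tasks go to the video pipeline,
# everything else to the image pipeline, both under fp16 autocast.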
@torch.no_grad()
def inference(image, task, *args, **kwargs):
    with torch.autocast(device_type='cuda', dtype=torch.float16):
        if 'Video' in task:
            return interactive_infer_video(model, audio, image, task, *args, **kwargs)
        else:
            return interactive_infer_image(model, audio, image, task, *args, **kwargs)
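
# Thin wrappers around Gradio components that pre-set the upload/sketch
# options used by the demo (Gradio 3.x component API).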
class ImageMask(gr.components.Image):
    """
    Sets: source="upload", tool="sketch"
    """
    is_template = True

    def __init__(self, **kwargs):
        super().__init__(source="upload", tool="sketch", interactive=True, **kwargs)

    def preprocess(self, x):
        return super().preprocess(x)
class Video(gr.components.Video):
    """
    Sets: source="upload"
    """
    is_template = True

    def __init__(self, **kwargs):
        super().__init__(source="upload", **kwargs)

    def preprocess(self, x):
        return super().preprocess(x)

'''
launch app
'''
title = "SEEM: Segment Everything Everywhere All At Once"
description = """
<div style="text-align: center; font-weight: bold;">
    <span style="font-size: 18px" id="paper-info">
        [<a href="https://github.com/UX-Decoder/Segment-Everything-Everywhere-All-At-Once" target="_blank">GitHub</a>]
        [<a href="https://arxiv.org/pdf/2304.06718.pdf" target="_blank">arXiv</a>]
    </span>
</div>
<div style="text-align: left; font-weight: bold;">
    <p>
        🌪 Note: The current model is run on <span style="color:blue;">SEEM {}</span>; for <span style="color:blue;">best performance</span> refer to <a href="https://huggingface.co/spaces/xdecoder/SEEM" target="_blank"><span style="color:red;">our demo</span></a>.
    </p>
</div>
""".format(cur_model)
'''Usage
Instructions:
🎈 Try our default examples first (the sketch is not automatically drawn on the input and example images);
🎈 The video demo takes about 30-60s to process; please refresh if you encounter an error while uploading;
🎈 Upload an image/video (if you want to use a referred region of another image, check "Example" and upload the other image in the referring image panel);
🎈 Select at least one type of prompt (if you want to use a referred region of another image, check "Example");
🎈 Remember to provide an actual prompt for each prompt type you select, otherwise you will hit an error (e.g., remember to draw on the referring image);
🎈 By default our model supports the 133 COCO categories; other objects will be classified as 'others' or misclassified.
'''
article = "This demo runs on SEEM-Tiny."
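# One input widget per prompt type: stroke canvas, mode selector, referring-image
# canvas, referring text, microphone audio, and referring video.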
inputs = [
    ImageMask(label="[Stroke] Draw on Image", type="pil"),
    gr.inputs.CheckboxGroup(choices=["Stroke", "Example", "Text", "Audio", "Video", "Panoptic"], type="value", label="Interactive Mode"),
    ImageMask(label="[Example] Draw on Referring Image", type="pil"),
    gr.Textbox(label="[Text] Referring Text"),
    gr.Audio(label="[Audio] Referring Audio", source="microphone", type="filepath"),
    gr.Video(label="[Video] Referring Video Segmentation", format="mp4", interactive=True),
]
gr.Interface(
    fn=inference,
    inputs=inputs,
    outputs=[
        gr.outputs.Image(
            type="pil",
            label="Segmentation Results (COCO classes as label)"),
        gr.Video(
            label="Video Segmentation Results (COCO classes as label)", format="mp4"),
    ],
    examples=[
        ["demo/seem/examples/corgi1.webp", ["Text"], "demo/seem/examples/corgi2.jpg", "The corgi.", None, None],
        ["demo/seem/examples/river1.png", ["Text", "Audio"], "demo/seem/examples/river2.png", "The green trees.", "demo/seem/examples/river1.wav", None],
        ["demo/seem/examples/zebras1.jpg", ["Example"], "demo/seem/examples/zebras2.jpg", "", None, None],
        ["demo/seem/examples/fries1.png", ["Example"], "demo/seem/examples/fries2.png", "", None, None],
        ["demo/seem/examples/placeholder.png", ["Video"], "demo/seem/examples/ref_vase.JPG", "", None, "demo/seem/examples/vasedeck.mp4"],
    ],
    title=title,
    description=description,
    article=article,
    allow_flagging='never',
    cache_examples=False,
).launch(share=True)