#!/usr/bin/env python3
# --------------------------------------------------------
# SEEM -- Segment Everything Everywhere All At Once
# Copyright (c) 2022 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Xueyan Zou ([email protected]), Jianwei Yang ([email protected])
# --------------------------------------------------------

# Install dependencies and patch files before any imports
import os
import sys
import subprocess

print("Setting up SEEM environment...")

# Create a custom distributed.py file that doesn't need mpi4py
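# (The stub below assumes single-process, single-GPU execution and only mirrors
# the helpers this demo actually imports from utils.distributed.)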
os.makedirs('utils', exist_ok=True)
with open('utils/distributed.py', 'w') as f:
    f.write("""# Custom distributed.py without mpi4py dependency
import os
import torch
import torch.distributed as dist

class MPI:
    class COMM_WORLD:
        @staticmethod
        def Get_rank():
            return 0
        @staticmethod
        def Get_size():
            return 1

def init_distributed(opt=None):
    if opt is not None:
        opt.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        opt.rank = 0
        opt.world_size = 1
        opt.gpu = 0
        return opt
    
    return None

def get_rank():
    return 0

def get_world_size():
    return 1

def is_main_process():
    return True

def synchronize():
    pass

def all_gather(data):
    return [data]

def reduce_dict(input_dict, average=True):
    return input_dict
""")
print("Created custom distributed.py")

# Install detectron2
print("Installing detectron2...")
try:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "git+https://github.com/MaureenZOU/detectron2-xyz.git"])
    print("Detectron2 installation complete!")
except Exception as e:
    print(f"Error installing detectron2: {e}")
    sys.exit(1)

# Set Python path to include the repository root
os.environ["PYTHONPATH"] = os.getcwd()
print(f"Set PYTHONPATH to: {os.getcwd()}")

# Continue with regular imports
import warnings
import PIL
from PIL import Image
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple

import gradio as gr
import torch
import argparse
import whisper
import numpy as np

from gradio import processing_utils
from modeling.BaseModel import BaseModel
from modeling import build_model
from utils.distributed import init_distributed
from utils.arguments import load_opt_from_config_files
from utils.constants import COCO_PANOPTIC_CLASSES

from demo.seem.tasks import *

def parse_option():
    parser = argparse.ArgumentParser('SEEM Demo', add_help=False)
    parser.add_argument('--conf_files', default="configs/seem/focall_unicl_lang_demo.yaml", metavar="FILE", help='path to config file', )
    cfg = parser.parse_args()
    return cfg

'''
build args
'''
cfg = parse_option()
opt = load_opt_from_config_files([cfg.conf_files])
opt = init_distributed(opt)

# META DATA
cur_model = 'None'
if 'focalt' in cfg.conf_files:
    pretrained_pth = os.path.join("seem_focalt_v0.pt")
    if not os.path.exists(pretrained_pth):
        os.system("wget {}".format("https://huggingface.co/xdecoder/SEEM/resolve/main/seem_focalt_v0.pt"))
    cur_model = 'Focal-T'
elif 'focal' in cfg.conf_files:
    pretrained_pth = os.path.join("seem_focall_v0.pt")
    if not os.path.exists(pretrained_pth):
        os.system("wget {}".format("https://huggingface.co/xdecoder/SEEM/resolve/main/seem_focall_v0.pt"))
    cur_model = 'Focal-L'
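else:
    # Defensive fallback (not part of the original selection logic): fail early with a
    # clear message if the config name matches neither backbone, since pretrained_pth
    # would otherwise be undefined when the model is built below.
    raise ValueError(f"Unrecognized backbone in config file: {cfg.conf_files}")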

'''
build model
'''
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
model = BaseModel(opt, build_model(opt)).from_pretrained(pretrained_pth).eval().to(device)
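# Pre-compute text embeddings for the COCO panoptic vocabulary (plus a "background"
# class) once, so class names do not need to be re-encoded on every request.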
with torch.no_grad():
    model.model.sem_seg_head.predictor.lang_encoder.get_text_embeddings(COCO_PANOPTIC_CLASSES + ["background"], is_eval=True)

'''
audio
'''
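# Whisper's "base" model transcribes the referring-audio prompt to text before segmentation.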
audio = whisper.load_model("base")

@torch.no_grad()
def inference(image, task, *args, **kwargs):
    if torch.cuda.is_available():
        with torch.autocast(device_type='cuda', dtype=torch.float16):
            if 'Video' in task:
                return interactive_infer_video(model, audio, image, task, *args, **kwargs)
            else:
                return interactive_infer_image(model, audio, image, task, *args, **kwargs)
    else:
        # Run without autocast on CPU
        if 'Video' in task:
            return interactive_infer_video(model, audio, image, task, *args, **kwargs)
        else:
            return interactive_infer_image(model, audio, image, task, *args, **kwargs)
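
# Hypothetical direct call (not part of the original demo): the Gradio interface below
# passes the values of `inputs` positionally, roughly:
#   inference({"image": Image.open("examples/corgi1.webp"), "mask": None},
#             ["Text"], None, "The corgi.", None, None)
# The exact argument handling lives in demo.seem.tasks.interactive_infer_image.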

class ImageMask(gr.components.Image):
    """
    Sets: source="canvas", tool="sketch"
    """

    is_template = True

    def __init__(self, **kwargs):
        super().__init__(source="upload", tool="sketch", interactive=True, **kwargs)

    def preprocess(self, x):
        return super().preprocess(x)

class Video(gr.components.Video):
    """
    Sets: source="canvas", tool="sketch"
    """

    is_template = True

    def __init__(self, **kwargs):
        super().__init__(source="upload", **kwargs)

    def preprocess(self, x):
        return super().preprocess(x)


'''
launch app
'''
title = "SEEM: Segment Everything Everywhere All At Once"
description = """
<div style="text-align: center; font-weight: bold;">
    <span style="font-size: 18px" id="paper-info">
        [<a href="https://github.com/UX-Decoder/Segment-Everything-Everywhere-All-At-Once" target="_blank">GitHub</a>]
        [<a href="https://arxiv.org/pdf/2304.06718.pdf" target="_blank">arXiv</a>]
    </span>
</div>
<div style="text-align: left; font-weight: bold;">
    <br>
    &#x1F32A; Note: The current model runs on <span style="color:blue;">SEEM {}</span>; for <span style="color:blue;">best performance</span>, refer to <a href="https://huggingface.co/spaces/xdecoder/SEEM" target="_blank"><span style="color:red;">our demo</span></a>.
</div>
""".format(cur_model)

'''Usage
Instructions:
&#x1F388 Try our default examples first (the sketch is not automatically drawn on the input and example images);
&#x1F388 For the video demo, processing takes about 30-60 s; please refresh if you hit an error while uploading;
&#x1F388 Upload an image/video (if you want to use a referred region from another image, check "Example" and upload that image in the referring-image panel);
&#x1F388 Select at least one prompt type (check "Example" if you want to use a referred region from another image);
&#x1F388 Remember to provide the actual prompt for each prompt type you select, otherwise you will get an error (e.g., remember to draw on the referring image);
&#x1F388 By default the model supports the 133 COCO categories; other objects will be labeled 'others' or misclassified.
'''

article = "The Demo is Run on SEEM-Tiny."
inputs = [
    ImageMask(label="[Stroke] Draw on Image", type="pil"),
    gr.inputs.CheckboxGroup(choices=["Stroke", "Example", "Text", "Audio", "Video", "Panoptic"], type="value", label="Interactive Mode"),
    ImageMask(label="[Example] Draw on Referring Image", type="pil"),
    gr.Textbox(label="[Text] Referring Text"),
    gr.Audio(label="[Audio] Referring Audio", source="microphone", type="filepath"),
    gr.Video(label="[Video] Referring Video Segmentation", format="mp4", interactive=True),
]
gr.Interface(
    fn=inference,
    inputs=inputs,
    outputs=[
        gr.outputs.Image(type="pil", label="Segmentation Results (COCO classes as label)"),
        gr.Video(label="Video Segmentation Results (COCO classes as label)", format="mp4"),
    ],
    examples=[
        ["examples/corgi1.webp", ["Text"], "examples/corgi2.jpg", "The corgi.", None, None],
        ["examples/river1.png", ["Text", "Audio"], "examples/river2.png", "The green trees.", "examples/river1.wav", None],
        ["examples/zebras1.jpg", ["Example"], "examples/zebras2.jpg", "", None, None],
        ["examples/fries1.png", ["Example"], "examples/fries2.png", "", None, None],
        ["examples/placeholder.png", ["Video"], "examples/ref_vase.JPG", "", None, "examples/vasedeck.mp4"],
    ],
    title=title,
    description=description,
    article=article,
    allow_flagging='never',
    cache_examples=False,
).launch(share=True)