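"""Gradio demo for EVP (Enhanced Visual Perception).

The app exposes two tabs, depth prediction and referring segmentation, with both
models loaded from the Hugging Face Hub via AutoModel and trust_remote_code=True.
"""
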
import os
import sys

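# Make the bundled EVP sub-packages (depth, refer, stable-diffusion, taming-transformers, clip) importable.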
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 'depth')))
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 'refer')))
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 'stable-diffusion')))
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 'src/taming-transformers')))
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 'src/clip')))


# Relative paths used below (e.g. the example images in imgs/) are resolved from the 'depth' directory.
os.chdir(os.path.abspath(os.path.join(os.path.dirname(__file__), 'depth')))


import cv2
import numpy as np
import torch
from depth.models_depth.model import EVPDepth
from models_refer.model import EVPRefer
from depth.configs.train_options import TrainOptions
from depth.configs.test_options import TestOptions
import glob
import utils
import torchvision.transforms as transforms
from utils_depth.misc import colorize
from PIL import Image
import torch.nn.functional as F
import gradio as gr
import tempfile
from transformers import CLIPTokenizer, AutoModel


css = """

#img-display-container {

    max-height: 50vh;

    }

#img-display-input {

    max-height: 40vh;

    }

#img-display-output {

    max-height: 40vh;

    }



"""

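# Build the depth prediction tab: takes an input image and returns a colorized
# depth map plus a downloadable 16-bit depth PNG.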
def create_depth_demo(model, device):
    gr.Markdown("### Depth Prediction demo")
    with gr.Row():
        input_image = gr.Image(label="Input Image", type='pil', elem_id='img-display-input')
        depth_image = gr.Image(label="Depth Map", elem_id='img-display-output')
    raw_file = gr.File(label="16-bit raw depth, multiplier:256")
    submit = gr.Button("Submit")
    
    def on_submit(image):
        # Convert the PIL image to a (1, 3, H, W) tensor batch.
        transform = transforms.ToTensor()
        image = transform(image).unsqueeze(0).to(device)
        shape = image.shape

        # Resize to the network input resolution and pad the height to 480.
        image = torch.nn.functional.interpolate(image, (440, 480), mode='bilinear', align_corners=True)
        image = F.pad(image, (0, 0, 40, 0))

        with torch.no_grad():
            pred = model(image)
        # The hub model is expected to return a numpy depth map; convert it to a tensor if needed.
        if not isinstance(pred, torch.Tensor):
            pred = torch.from_numpy(pred)
        pred = pred.to(device).float()

        if pred.dim() == 2:  # H×W -> 1×1×H×W
            pred = pred.unsqueeze(0).unsqueeze(0)

        # Undo the padding and resize back to the original resolution.
        pred = pred[:, :, 40:, :]
        pred = torch.nn.functional.interpolate(pred, shape[2:], mode='bilinear', align_corners=True)
        pred_d_numpy = pred.squeeze().cpu().numpy()
        colored_depth, _, _ = colorize(pred_d_numpy, cmap='gray_r')

        # Save the raw depth scaled by 256 as a 16-bit PNG for download.
        tmp = tempfile.NamedTemporaryFile(suffix='.png', delete=False)
        raw_depth = Image.fromarray((pred_d_numpy * 256).astype('uint16'))
        raw_depth.save(tmp.name)
        return [colored_depth, tmp.name]

    submit.click(on_submit, inputs=[input_image], outputs=[depth_image, raw_file])
    examples = gr.Examples(examples=["imgs/test_img1.jpg", "imgs/test_img2.jpg", "imgs/test_img3.jpg", "imgs/test_img4.jpg", "imgs/test_img5.jpg"],
                           inputs=[input_image])


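# Build the referring segmentation tab: takes an image and a text prompt and
# returns the image with the referred region highlighted and outlined.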
def create_refseg_demo(model, tokenizer, device):
    gr.Markdown("### Referring Segmentation demo")
    with gr.Row():
        input_image = gr.Image(label="Input Image", type='pil', elem_id='img-display-input')
        refseg_image = gr.Image(label="Output Mask", elem_id='img-display-output')
    input_text = gr.Textbox(label='Prompt', placeholder='Please upload your image first', lines=2)
    submit = gr.Button("Submit")
    
    def on_submit(image, text):
        # Convert the PIL image to a (1, 3, H, W) tensor batch.
        transform = transforms.ToTensor()
        image_t = transform(image).unsqueeze(0).to(device)

        # The remote-code model is called with the image batch and the raw text prompt.
        with torch.no_grad():
            out = model(image_t, text)

        # Ensure numpy mask
        if isinstance(out, torch.Tensor):
            mask = out.squeeze().detach().cpu().numpy()
        else:
            mask = out

        # If model returns multi-channel, collapse with argmax
        if mask.ndim > 2:
            mask = np.argmax(mask, axis=0)

        mask = mask.astype(np.uint8)

        # Overlay mask on original image
        image_np = np.array(image).copy()
        alpha = 0.65
        image_np[mask == 0] = (image_np[mask == 0] * alpha).astype(np.uint8)

        # Draw contours
        contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        cv2.drawContours(image_np, contours, -1, (0, 255, 0), 2)

        return Image.fromarray(image_np)


    submit.click(on_submit, inputs=[input_image, input_text], outputs=refseg_image)
    examples = gr.Examples(
        examples=[
            ["imgs/test_img2.jpg", "green plant"],
            ["imgs/test_img3.jpg", "chair"],
            ["imgs/test_img4.jpg", "left green plant"],
            ["imgs/test_img5.jpg", "man walking on foot"],
            ["imgs/test_img5.jpg", "the rightest camel"],
        ],
        inputs=[input_image, input_text]
    )


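# Load the models, assemble the tabbed Gradio interface, and launch the demo.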
def main():
    # Set to False to skip the depth model and serve only the referring segmentation tab.
    load_both_models = True

    opt = TestOptions().initialize()
    args = opt.parse_args()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if load_both_models:
        model = AutoModel.from_pretrained("MykolaL/evp_depth", trust_remote_code=True).to(device).eval()

    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
    model_refseg = AutoModel.from_pretrained("MykolaL/evp_refer", trust_remote_code=True).to(device).eval()
    print('Models loaded successfully')
    
    title = "# EVP"
    description = """Official demo for **EVP: Enhanced Visual Perception using Inverse Multi-Attentive Feature

    Refinement and Regularized Image-Text Alignment**.

    EVP is a deep learning model for metric depth estimation from a single image as well as referring segmentation.

    Please refer to our [project page](https://lavreniuk.github.io/EVP) or [paper](https://arxiv.org/abs/2312.08548) or [github](https://github.com/Lavreniuk/EVP) for more details."""

    with gr.Blocks(css=css) as demo:
        gr.Markdown(title)
        gr.Markdown(description)
        if load_both_models:
            with gr.Tab("Depth Prediction"):
                create_depth_demo(model, device)
        with gr.Tab("Referring Segmentation"):
            create_refseg_demo(model_refseg, tokenizer, device)
        gr.HTML('''<br><br><br><center>You can duplicate this Space to skip the queue:<a href="https://huggingface.co/spaces/MykolaL/evp?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a><br>

                <p><img src="https://visitor-badge.glitch.me/badge?page_id=MykolaL/evp" alt="visitors"></p></center>''')

    demo.queue().launch(share=True)


if __name__ == '__main__':
    main()