# Spaces: innat / VideoMAE (Running)
# File size: 4,118 Bytes

import gradio as gr
import numpy as np
import imageio

import tensorflow as tf
from tensorflow import keras

from utils import TubeMaskingGenerator
from utils import read_video, frame_sampling, denormalize, reconstrunction
from utils import IMAGENET_MEAN, IMAGENET_STD, num_frames, patch_size, input_size
from labels import K400_label_map, SSv2_label_map, UCF_label_map


def tube_mask_generator(mask_ratio):
    """Create a boolean tube mask tensor with a leading batch axis of size 1.

    Args:
        mask_ratio: Forwarded unchanged to ``TubeMaskingGenerator`` as its
            ``mask_ratio`` argument.

    Returns:
        A ``tf.bool`` tensor: the generator's output converted to an int32
        constant, given a new axis 0, then cast to boolean.
    """
    # Token grid handed to the generator: half the frame count along time
    # and the number of patches along each spatial side.
    # NOTE(review): the ``// 2`` presumably reflects a temporal tubelet size
    # of 2 -- confirm against the model definition.
    token_grid = (
        num_frames // 2,
        input_size // patch_size[0],
        input_size // patch_size[1],
    )
    generator = TubeMaskingGenerator(input_size=token_grid, mask_ratio=mask_ratio)

    mask_as_int = tf.constant(generator(), dtype=tf.int32)
    batched_mask = tf.expand_dims(mask_as_int, axis=0)
    return tf.cast(batched_mask, tf.bool)


# Already-loaded (fine-tuned model, pre-trained model, label map) triples,
# keyed by dataset name, so each pair of SavedModels is read from disk at
# most once per process instead of once per request.
_MODEL_CACHE = {}


def get_model(data_type):
    """Return the fine-tuned model, pre-trained model and label map for a dataset.

    Results are memoised per ``data_type``; repeated calls return the same
    model objects and the same label-map dict, so callers should treat the
    label map as read-only.

    Args:
        data_type: Key into ``MODELS`` (looked up as a global at call time).
            Entry ``[0]`` is loaded as the fine-tuned model and entry ``[1]``
            as the pre-trained model.

    Returns:
        A tuple ``(ft_model, pt_model, label_map)`` where ``label_map`` is
        ``K400_label_map`` inverted (its values mapped back to its keys).

    Raises:
        KeyError: If ``data_type`` is not a key of ``MODELS``.
    """
    cached = _MODEL_CACHE.get(data_type)
    if cached is None:
        model_paths = MODELS[data_type]
        ft_model = keras.models.load_model(model_paths[0])
        pt_model = keras.models.load_model(model_paths[1])
        # NOTE(review): the K400 label map is used for every dataset even
        # though this file also imports SSv2_label_map and UCF_label_map.
        # Confirm whether that is intentional before switching, since the
        # map must match the class count of the checkpoint actually loaded.
        label_map = {v: k for k, v in K400_label_map.items()}
        cached = (ft_model, pt_model, label_map)
        _MODEL_CACHE[data_type] = cached
    return cached


def inference(video_file, dataset_type, mask_ratio):
    """Classify a video and render a GIF of its masked reconstruction.

    Args:
        video_file: Passed to ``read_video`` to open the clip.
        dataset_type: Passed to ``get_model`` to select the model pair and
            label map.
        mask_ratio: Passed to ``tube_mask_generator``.

    Returns:
        A tuple ``(confidences, gif_path)``. ``confidences`` maps each label
        to its softmax probability as a Python float, inserted from highest
        to lowest probability. ``gif_path`` is ``'combined.gif'``, written by
        this call; each GIF frame is the input frame, the masked input and
        the reconstruction stacked horizontally.
    """
    separator = '---------------------------'
    for item in (separator, video_file, dataset_type, mask_ratio, separator):
        print(item)

    clip = frame_sampling(read_video(video_file), num_frames=num_frames)
    masked_positions = tube_mask_generator(mask_ratio)
    classifier, autoencoder, label_map = get_model(dataset_type)
    classifier.trainable = False
    autoencoder.trainable = False

    # Both models take the clip with a leading axis added.
    batch = clip[None, ...]

    # Classification with the fine-tuned model.
    logits = classifier(batch, training=False)
    probabilities = tf.nn.softmax(logits).numpy().squeeze(0)
    confidences = {}
    for class_idx in np.argsort(probabilities)[::-1]:
        confidences[label_map[class_idx]] = float(probabilities[class_idx])

    # Reconstruction with the pre-trained (masked autoencoder) model.
    decoded = autoencoder(batch, masked_positions, training=False)
    reconstruct_output, mask = reconstrunction(batch, masked_positions, decoded)

    # Bring all three views back to displayable pixel values.
    original_view = denormalize(clip)
    masked_view = denormalize(mask[0] * clip)
    restored_view = denormalize(reconstruct_output)

    panels = [
        np.hstack([original, masked, restored])
        for original, masked, restored in zip(
            original_view, masked_view, restored_view
        )
    ]

    # NOTE(review): the output path is fixed, so overlapping requests would
    # write to the same file -- confirm whether that matters for deployment.
    gif_path = 'combined.gif'
    imageio.mimsave(gif_path, panels, duration=300, loop=0)
    return confidences, gif_path


# SavedModel directories per benchmark, as [fine-tuned, pre-trained].
# Defined at module level because get_model() resolves ``MODELS`` as a
# global; as a local of main() it was invisible there and raised NameError.
# NOTE(review): the SSv2 and UCF entries point at the K400 checkpoints --
# presumably placeholders; confirm the intended paths.
MODELS = {
    'K400': [
        './TFVideoMAE_S_K400_16x224_FT',
        './TFVideoMAE_S_K400_16x224_PT',
    ],
    'SSv2': [
        './TFVideoMAE_S_K400_16x224_FT',
        './TFVideoMAE_S_K400_16x224_PT',
    ],
    'UCF': [
        './TFVideoMAE_S_K400_16x224_FT',
        './TFVideoMAE_S_K400_16x224_PT',
    ],
}

# Choices offered by the dataset radio button; each must be a key of MODELS.
BENCHMARK_DATASETS = ['K400', 'SSv2', 'UCF']

# One value per input component, in order: video, dataset, mask ratio.
# The dataset value must be one of BENCHMARK_DATASETS, otherwise the
# MODELS lookup in get_model() fails with KeyError.
SAMPLE_EXAMPLES = [
    ["examples/k400.mp4", 'K400', 0.5],
    ["examples/k400.mp4", 'SSv2', 0.5],
    ["examples/k400.mp4", 'UCF', 0.5],
]


def main():
    """Build the Gradio interface around ``inference`` and launch it.

    Inputs are a video, a dataset choice and a mask ratio in ``[0, 1]``;
    outputs are the class scores and the GIF written by ``inference``.
    """
    # NOTE(review): ``default=`` and ``gr.Video(type=...)`` are kept exactly
    # as written; whether they are honoured depends on the installed Gradio
    # version -- confirm against the pinned release.
    iface = gr.Interface(
        fn=inference,
        inputs=[
            gr.Video(type="file", label="Input Video"),
            gr.Radio(
                BENCHMARK_DATASETS,
                type='value',
                default=BENCHMARK_DATASETS[0],
                label='Dataset',
            ),
            gr.Slider(
                0,
                1,
                step=0.05,
                default=0.5,
                label='Mask Ratio',
            ),
        ],
        outputs=[
            gr.Label(num_top_classes=3, label='scores'),
            gr.Image(type="filepath", label='reconstructed'),
        ],
        examples=SAMPLE_EXAMPLES,
        title="VideoMAE",
        description="Keras reimplementation of <a href='https://github.com/innat/VideoMAE'>VideoMAE</a> is presented here."
    )

    iface.launch()

# Start the demo only when this file is run as a script, not when imported.
if __name__ == '__main__':
    main()