import os
import uuid
from omegaconf import OmegaConf
import spaces

import random

import imageio
import torch
import torchvision
import gradio as gr
import numpy as np
from gradio.components import Textbox, Video

from utils.common_utils import load_model_checkpoint
from utils.utils import instantiate_from_config
from scheduler.t2v_turbo_scheduler import T2VTurboScheduler
from pipeline.t2v_turbo_vc2_pipeline import T2VTurboVC2Pipeline

DESCRIPTION = """# T2V-Turbo-v2 🚀
## A fast and efficient txt2video model that doesn't suck

This space was forked from the original so that I can fix whatever is causing its API not to work with HuggingChat's tools interface...

You know, because it would be really cool to combine an LLM with a text2video model that's fast, decent quality, and open source.

I've also increased the upper bounds of some params and made other params adjustable in the UI that were previously locked. Please read the info text, because some of them are likely not worth messing with, but I like to give users the freedom to explore.

The TL;DR on this model is that it was distilled from VideoCrafter2 and ended up beating the parent model on all of the benchmarks, even though it's smaller and MUCH faster.

Don't get TOO excited, though: the paper claims it beat Kling and Runway Gen-3 on comprehensive benchmark scores, but this ain't Gen-3. It's just not. It's a low-res, high-efficiency txt2video engine that's perfect for recreational use and integration with chatbots, but it won't be winning any Oscars.

Official Project Page with links to Papers, GitHub Code, and Leaderboard:
[Project page for T2V-Turbo-v2](https://t2v-turbo-v2.github.io) 🤓
"""
if torch.cuda.is_available():
    DESCRIPTION += "\n<p>Running on CUDA 😀</p>"
elif hasattr(torch, "xpu") and torch.xpu.is_available():
    DESCRIPTION += "\n<p>Running on XPU 🤓</p>"
else:
    DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
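
# A minimal sketch of calling this Space's API programmatically with
# gradio_client, relevant to the HuggingChat tools integration mentioned
# above (the repo id is a placeholder; argument order matches the
# ``inputs`` list of the Interface at the bottom of this file):
#
#     from gradio_client import Client
#     client = Client("your-username/t2v-turbo-v2")  # hypothetical repo id
#     video, prompt, info, seed = client.predict(
#         "An astronaut riding a horse.",  # prompt
#         7.5,     # guidance_scale
#         0.05,    # motion_gs
#         0.5,     # percentage
#         16,      # num_inference_steps
#         16,      # num_frames
#         0,       # seed
#         True,    # randomize_seed
#         "bf16",  # param_dtype
#         8,       # fps
#         api_name="/predict",
#     )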

MAX_SEED = np.iinfo(np.int32).max


def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)
    return seed


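# Write one clip to disk as H.264. CRF 10 is near-lossless: larger files,
# minimal compression artifacts.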
def save_video(video_array, video_save_path, fps: int = 16):
    video = video_array.detach().cpu()
    video = torch.clamp(video.float(), -1.0, 1.0)
    video = video.permute(1, 0, 2, 3)  # (c, t, h, w) -> (t, c, h, w)
    video = (video + 1.0) / 2.0  # rescale [-1, 1] -> [0, 1]
    video = (video * 255).to(torch.uint8).permute(0, 2, 3, 1)  # (t, h, w, c) for write_video

    torchvision.io.write_video(
        video_save_path, video, fps=fps, video_codec="h264", options={"crf": "10"}
    )

example_txt = [
    "An astronaut riding a horse.",
    "Darth vader surfing in waves.",
    "light wind, feathers moving, she moves her gaze, 4k",
    "a girl floating underwater.",
    "Pikachu snowboarding.",
    "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k",
    "A musician strums his guitar, serenading the moonlit night.",
]

examples = [[i, 7.5, 0.05, 0.5, 16, 16, 0, True, "bf16", 8] for i in example_txt]
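# NOTE: each example row follows the argument order of generate() / the
# ``inputs`` list below: prompt, guidance_scale, motion_gs, percentage,
# num_inference_steps, num_frames, seed, randomize_seed, param_dtype, fps.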

@spaces.GPU(duration=120)
@torch.inference_mode()
def generate(
    prompt: str,
    guidance_scale: float = 7.5,
    motion_gs: float = 0.05,
    percentage: float = 0.5,
    num_inference_steps: int = 4,
    num_frames: int = 16,
    seed: int = 0,
    randomize_seed: bool = False,
    param_dtype="bf16",
    fps: int = 8,
):
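    """Generate a short video from a text prompt with T2V-Turbo-v2.

    Returns the saved video path, the echoed prompt, a human-readable
    settings summary, and the seed actually used, in that order.
    """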

    seed = randomize_seed_fn(seed, randomize_seed)
    torch.manual_seed(seed)

    # Map the UI precision string to a torch dtype; unet.dtype is set
    # explicitly in addition to the module casts below.
    dtype_map = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}
    if param_dtype not in dtype_map:
        raise ValueError(f"Unknown dtype: {param_dtype}")
    dtype = dtype_map[param_dtype]
    unet.dtype = dtype

    pipeline.unet.to(device, dtype)
    pipeline.text_encoder.to(device, dtype)
    pipeline.vae.to(device, dtype)
    pipeline.to(device, dtype)

    result = pipeline(
        prompt=prompt,
        frames=num_frames,
        fps=fps,
        guidance_scale=guidance_scale,
        motion_gs=motion_gs,
        use_motion_cond=True,
        percentage=percentage,
        num_inference_steps=num_inference_steps,
        lcm_origin_steps=200,
        num_videos_per_prompt=1,
    )
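    # result is assumed to be a batch of clips shaped (B, C, T, H, W);
    # save_video below consumes the single clip result[0] as (C, T, H, W).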

    torch.cuda.empty_cache()
    # Unique filename per request so concurrent generations don't overwrite
    # each other's output.
    root_path = "./videos/"
    os.makedirs(root_path, exist_ok=True)
    video_save_path = os.path.join(root_path, f"{uuid.uuid4()}.mp4")

    save_video(result[0], video_save_path, fps=fps)
    display_model_info = f"Video size: {num_frames}x320x512, Sampling Steps: {num_inference_steps}, Guidance Scale: {guidance_scale}"
    return video_save_path, prompt, display_model_info, seed


block_css = """
#buttons button {
    min-width: min(120px,100%);
}
"""


if __name__ == "__main__":
    device = torch.device("cuda:0")
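    # cuda:0 is hard-coded on purpose: the @spaces.GPU decorator attaches a
    # GPU for each generate() call on Spaces, and the description above warns
    # that the demo does not work on CPU.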

    config = OmegaConf.load("configs/inference_t2v_512_v2.0.yaml")
    model_config = config.pop("model", OmegaConf.create())
    pretrained_t2v = instantiate_from_config(model_config)
    pretrained_t2v = load_model_checkpoint(pretrained_t2v, "checkpoints/VideoCrafter2_model.ckpt")
    
    unet_config = model_config["params"]["unet_config"]
    unet_config["params"]["use_checkpoint"] = False
    unet_config["params"]["time_cond_proj_dim"] = 256
    unet_config["params"]["motion_cond_proj_dim"] = 256
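    # The two *_cond_proj_dim entries add the timestep- and motion-conditioning
    # projection layers that checkpoints/unet_mg.pt presumably was trained with
    # (the strict state_dict load below would fail without them); gradient
    # checkpointing is off since this is inference-only.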

    unet = instantiate_from_config(unet_config)

    unet.load_state_dict(torch.load("checkpoints/unet_mg.pt", map_location=device))
    unet.eval()

    pretrained_t2v.model.diffusion_model = unet
    scheduler = T2VTurboScheduler(
        linear_start=model_config["params"]["linear_start"],
        linear_end=model_config["params"]["linear_end"],
    )
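    # The scheduler reuses the parent VideoCrafter2 config's linear beta-schedule
    # endpoints, keeping the noise schedule consistent with distillation training.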
    pipeline = T2VTurboVC2Pipeline(pretrained_t2v, scheduler, model_config)
    pipeline.to(device)

    demo = gr.Interface(
        fn=generate,
        inputs=[
            Textbox(label="", placeholder="Please enter your prompt"),
            gr.Slider(
                label="CFG Guidance",
                minimum=1,
                maximum=21,
                step=0.1,
                value=7.5,
                info="Behaves like CFG Guidance on a txt2img diffusion model... 7.5 appears to indeed be the sweet spot, but for certain prompts you may wish to adjust"
            ),
            gr.Slider(
                label="MGS Guidance (Don't Change This)",
                minimum=0.0,
                maximum=1.0,
                step=0.01,
                value=0.05,
                info="No idea where they came up with the default of 0.05 or why they're so certain it's optimal, since it's not mentioned in the paper. I've therefore opened it up for experimentation, with very low expectations"
            ),

            gr.Slider(
                label="Motion Guidance Percentage (Don't Change This)",
                minimum=0.0,
                maximum=0.8,
                step=0.05,
                value=0.5,
                info="The authors specifically say in their paper that it's important to apply MG to only the first N inference steps out of M total steps. But the ideal value of N/M is not mentioned, so it may be worth playing with"
            ),

            gr.Slider(
                label="Inference Steps",
                minimum=2,
                maximum=200,
                step=1,
                value=16,
                info="This is an interesting one, because increasing the step count is the equivalent of techniques like CoT that we use to increase test-time compute in LLMs. In general, more steps = lower loss (higher quality). But the relationship is asymptotic and returns quickly diminish... Opened this up in case it's needed for certain use cases; otherwise leave @ 16"
            ),
            gr.Slider(
                label="Number of Video Frames",
                minimum=16,
                maximum=96,
                step=8,
                value=16,
                info="Generated video length = number of frames / FPS. The benchmark evals involved 16 frames, to my knowledge. It is unclear how high you can go before consistency falls apart... but it would be lovely to get 96 frames at 24 fps of high-quality video. Probably won't happen, but just in case, feel free to try"
            ),

            gr.Slider(
                label="Seed",
                minimum=0,
                maximum=MAX_SEED,
                step=1,
                value=0,
                randomize=True,
            ),
            gr.Checkbox(label="Randomize seed", value=True),
            gr.Radio(
                ["bf16", "fp16", "fp32"],
                label="torch.dtype",
                value="bf16",
                interactive=True,
                info="bf16 is fast and high quality. End users should not change this setting",
            ),
            gr.Slider(
                label="Desired Output FPS",
                minimum=8,
                maximum=24,
                step=8,
                value=8,
                info="Higher = smoother, lower = longer video, purely a matter of preference"
            ),

        ],
        outputs=[
            gr.Video(label="Generated Video", width=512, height=320, interactive=False, autoplay=True),
            Textbox(label="input prompt"),
            Textbox(label="model info"),
            gr.Slider(label="seed"),
        ],
        description=DESCRIPTION,
        theme=gr.themes.Default(),
        css=block_css,
        examples=examples,
        cache_examples=False,
        concurrency_limit=10,
    )
    demo.launch()