import os
import uuid
from omegaconf import OmegaConf
import spaces

import random

import imageio
import torch
import torchvision
import gradio as gr
import numpy as np
from gradio.components import Textbox, Video

from utils.common_utils import load_model_checkpoint
from utils.utils import instantiate_from_config
from scheduler.t2v_turbo_scheduler import T2VTurboScheduler
from pipeline.t2v_turbo_vc2_pipeline import T2VTurboVC2Pipeline

DESCRIPTION = """# T2V-Turbo-v2 🚀
## A fast and efficient txt2video model that doesn't suck

This space was forked from the original so that I can fix whatever is causing its API not to work with HuggingChat's tools interface...

You know, because it would be really cool to combine an LLM with a text2video model that's fast, decent quality, and open source.

I've also increased the upper bounds of some params and made other params adjustable in the UI that were previously locked. Please read the info text, because some of them are likely not worth messing with, but I like to give users the freedom to explore.

The TL;DR on this model is that it was distilled from VideoCrafter2 and ended up beating the parent model on all of the benchmarks, even though it's smaller and MUCH faster.

Don't get TOO excited, though: the paper claims it beat Kling and Runway Gen-3 on comprehensive benchmark scores, but this ain't Gen-3. It's just not. It's a low-res, high-efficiency txt2video engine that's perfect for recreational use and integration with chatbots, but it won't be winning any Oscars.

Official Project Page with links to Papers, GitHub Code, and Leaderboard:
[Project page for T2V-Turbo-v2](https://t2v-turbo-v2.github.io) 🤓
"""
if torch.cuda.is_available():
    DESCRIPTION += "\n<p>Running on CUDA 😀</p>"
elif hasattr(torch, "xpu") and torch.xpu.is_available():
    DESCRIPTION += "\n<p>Running on XPU 🤓</p>"
else:
    DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
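
# A minimal sketch of calling this Space's API programmatically with
# gradio_client, relevant to the HuggingChat tools integration mentioned
# above (the repo id is a placeholder; argument order matches the
# ``inputs`` list of the Interface at the bottom of this file):
#
#     from gradio_client import Client
#     client = Client("your-username/t2v-turbo-v2")  # hypothetical repo id
#     video, prompt, info, seed = client.predict(
#         "An astronaut riding a horse.",  # prompt
#         7.5,     # guidance_scale
#         0.05,    # motion_gs
#         0.5,     # percentage
#         16,      # num_inference_steps
#         16,      # num_frames
#         0,       # seed
#         True,    # randomize_seed
#         "bf16",  # param_dtype
#         8,       # fps
#         api_name="/predict",
#     )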

MAX_SEED = np.iinfo(np.int32).max


def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)
    return seed


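# Write one clip to disk as H.264. CRF 10 is near-lossless: larger files,
# minimal compression artifacts.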
def save_video(video_array, video_save_path, fps: int = 16):
    video = video_array.detach().cpu()
    video = torch.clamp(video.float(), -1.0, 1.0)
    video = video.permute(1, 0, 2, 3)  # (c, t, h, w) -> (t, c, h, w)
    video = (video + 1.0) / 2.0  # rescale [-1, 1] -> [0, 1]
    video = (video * 255).to(torch.uint8).permute(0, 2, 3, 1)  # (t, h, w, c) for write_video

    torchvision.io.write_video(
        video_save_path, video, fps=fps, video_codec="h264", options={"crf": "10"}
    )

example_txt = [
    "An astronaut riding a horse.",
    "Darth vader surfing in waves.",
    "light wind, feathers moving, she moves her gaze, 4k",
    "a girl floating underwater.",
    "Pikachu snowboarding.",
    "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k",
    "A musician strums his guitar, serenading the moonlit night.",
]

examples = [[i, 7.5, 0.05, 0.5, 16, 16, 0, True, "bf16", 8] for i in example_txt]
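# NOTE: each example row follows the argument order of generate() / the
# ``inputs`` list below: prompt, guidance_scale, motion_gs, percentage,
# num_inference_steps, num_frames, seed, randomize_seed, param_dtype, fps.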

@spaces.GPU(duration=120)
@torch.inference_mode()
def generate(
    prompt: str,
    guidance_scale: float = 7.5,
    motion_gs: float = 0.05,
    percentage: float = 0.5,
    num_inference_steps: int = 4,
    num_frames: int = 16,
    seed: int = 0,
    randomize_seed: bool = False,
    param_dtype="bf16",
    fps: int = 8,
):
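    """Generate a short video from a text prompt with T2V-Turbo-v2.

    Returns the saved video path, the echoed prompt, a human-readable
    settings summary, and the seed actually used, in that order.
    """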

    seed = randomize_seed_fn(seed, randomize_seed)
    torch.manual_seed(seed)

    # Map the UI precision string to a torch dtype; unet.dtype is set
    # explicitly in addition to the module casts below.
    dtype_map = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}
    if param_dtype not in dtype_map:
        raise ValueError(f"Unknown dtype: {param_dtype}")
    dtype = dtype_map[param_dtype]
    unet.dtype = dtype

    pipeline.unet.to(device, dtype)
    pipeline.text_encoder.to(device, dtype)
    pipeline.vae.to(device, dtype)
    pipeline.to(device, dtype)

    result = pipeline(
        prompt=prompt,
        frames=num_frames,
        fps=fps,
        guidance_scale=guidance_scale,
        motion_gs=motion_gs,
        use_motion_cond=True,
        percentage=percentage,
        num_inference_steps=num_inference_steps,
        lcm_origin_steps=200,
        num_videos_per_prompt=1,
    )
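    # result is assumed to be a batch of clips shaped (B, C, T, H, W);
    # save_video below consumes the single clip result[0] as (C, T, H, W).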

    torch.cuda.empty_cache()
    # Unique filename per request so concurrent generations don't overwrite
    # each other's output.
    root_path = "./videos/"
    os.makedirs(root_path, exist_ok=True)
    video_save_path = os.path.join(root_path, f"{uuid.uuid4()}.mp4")

    save_video(result[0], video_save_path, fps=fps)
    display_model_info = f"Video size: {num_frames}x320x512, Sampling Steps: {num_inference_steps}, Guidance Scale: {guidance_scale}"
    return video_save_path, prompt, display_model_info, seed


block_css = """
#buttons button {
    min-width: min(120px,100%);
}
"""


if __name__ == "__main__":
    device = torch.device("cuda:0")
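    # cuda:0 is hard-coded on purpose: the @spaces.GPU decorator attaches a
    # GPU for each generate() call on Spaces, and the description above warns
    # that the demo does not work on CPU.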

    config = OmegaConf.load("configs/inference_t2v_512_v2.0.yaml")
    model_config = config.pop("model", OmegaConf.create())
    pretrained_t2v = instantiate_from_config(model_config)
    pretrained_t2v = load_model_checkpoint(pretrained_t2v, "checkpoints/VideoCrafter2_model.ckpt")
    
    unet_config = model_config["params"]["unet_config"]
    unet_config["params"]["use_checkpoint"] = False
    unet_config["params"]["time_cond_proj_dim"] = 256
    unet_config["params"]["motion_cond_proj_dim"] = 256
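    # The two *_cond_proj_dim entries add the timestep- and motion-conditioning
    # projection layers that checkpoints/unet_mg.pt presumably was trained with
    # (the strict state_dict load below would fail without them); gradient
    # checkpointing is off since this is inference-only.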

    unet = instantiate_from_config(unet_config)

    unet.load_state_dict(torch.load("checkpoints/unet_mg.pt", map_location=device))
    unet.eval()

    pretrained_t2v.model.diffusion_model = unet
    scheduler = T2VTurboScheduler(
        linear_start=model_config["params"]["linear_start"],
        linear_end=model_config["params"]["linear_end"],
    )
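    # The scheduler reuses the parent VideoCrafter2 config's linear beta-schedule
    # endpoints, keeping the noise schedule consistent with distillation training.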
    pipeline = T2VTurboVC2Pipeline(pretrained_t2v, scheduler, model_config)
    pipeline.to(device)

    demo = gr.Interface(
        fn=generate,
        inputs=[
            Textbox(label="", placeholder="Please enter your prompt"),
            gr.Slider(
                label="CFG Guidance",
                minimum=1,
                maximum=21,
                step=0.1,
                value=7.5,
                info="Behaves like CFG Guidance on a txt2img diffusion model... 7.5 appears to indeed be the sweet spot, but for certain prompts you may wish to adjust"
            ),
            gr.Slider(
                label="MGS Guidance (Don't Change This)",
                minimum=0.0,
                maximum=1.0,
                step=0.01,
                value=0.05,
                info="No idea where they came up with the default of 0.05 or why they're so certain it's optimal, since it's not mentioned in the paper. I've therefore opened it up for experimentation, with very low expectations"
            ),

            gr.Slider(
                label="Motion Guidance Percentage (Don't Change This)",
                minimum=0.0,
                maximum=0.8,
                step=0.05,
                value=0.5,
                info="The authors specifically say in their paper that it's important to apply MG to only the first N inference steps out of M total steps. But the ideal value of N/M is not mentioned, so it may be worth playing with"
            ),

            gr.Slider(
                label="Inference Steps",
                minimum=2,
                maximum=200,
                step=1,
                value=16,
                info="This is an interesting one, because increasing the step count is the equivalent of techniques like CoT that we use to increase test-time compute in LLMs. In general, more steps = lower loss (higher quality). But the relationship is asymptotic and returns quickly diminish... Opened this up in case it's needed for certain use cases; otherwise leave @ 16"
            ),
            gr.Slider(
                label="Number of Video Frames",
                minimum=16,
                maximum=96,
                step=8,
                value=16,
                info="Generated video length = number of frames / FPS. The benchmark evals involved 16 frames, to my knowledge. It is unclear how high you can go before consistency falls apart... but it would be lovely to get 96 frames at 24 fps of high-quality video. Probably won't happen, but just in case, feel free to try"
            ),

            gr.Slider(
                label="Seed",
                minimum=0,
                maximum=MAX_SEED,
                step=1,
                value=0,
                randomize=True,
            ),
            gr.Checkbox(label="Randomize seed", value=True),
            gr.Radio(
                ["bf16", "fp16", "fp32"],
                label="torch.dtype",
                value="bf16",
                interactive=True,
                info="bf16 is fast and high quality. End users should not change this setting",
            ),
            gr.Slider(
                label="Desired Output FPS",
                minimum=8,
                maximum=24,
                step=8,
                value=8,
                info="Higher = smoother, lower = longer video, purely a matter of preference"
            ),

        ],
        outputs=[
            gr.Video(label="Generated Video", width=512, height=320, interactive=False, autoplay=True),
            Textbox(label="input prompt"),
            Textbox(label="model info"),
            gr.Slider(label="seed"),
        ],
        description=DESCRIPTION,
        theme=gr.themes.Default(),
        css=block_css,
        examples=examples,
        cache_examples=False,
        concurrency_limit=10,
    )
    demo.launch()