import os
import uuid
import random

import imageio
import numpy as np
import torch
import torchvision
import gradio as gr
import spaces
from gradio.components import Textbox, Video
from omegaconf import OmegaConf

from utils.common_utils import load_model_checkpoint
from utils.utils import instantiate_from_config
from scheduler.t2v_turbo_scheduler import T2VTurboScheduler
from pipeline.t2v_turbo_vc2_pipeline import T2VTurboVC2Pipeline

DESCRIPTION = """# T2V-Turbo-v2 🚀
## A fast and efficient txt2video model that doesn't suck

This space was forked from the original so that I can fix whatever is causing its API not to work with HuggingChat's tools interface... You know, because it would be really cool to combine an LLM with a text2video model that's fast, decent quality, and open source.

I've also increased the upper bounds of some params and made other params adjustable in the UI which were previously locked. Please read the info text, because some of them are likely not worth messing with, but I like to give users the freedom to explore.

The TLDR on this model is that it was distilled from VideoCrafter 2 and ended up beating the parent model on all of the benchmarks, even though it's smaller and MUCH faster. Don't get TOO excited though - when you read the paper, they claim it beat Kling and Runway Gen-3 on comprehensive benchmark scores, but this ain't Gen-3, it's just not. It's a low-res, high-efficiency txt2video engine that's perfect for recreational use and integration with chatbots, but it won't be winning any Oscars.

Official Project Page with links to Papers, GitHub Code, and Leaderboard: [Project page for T2V-Turbo-v2](https://t2v-turbo-v2.github.io) 🤓
"""
if torch.cuda.is_available():
    DESCRIPTION += "\n<p>Running on CUDA 😀</p>"
elif hasattr(torch, "xpu") and torch.xpu.is_available():
    DESCRIPTION += "\n<p>Running on XPU 🤓</p>"
else:
    DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"

MAX_SEED = np.iinfo(np.int32).max


def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)
    return seed


def save_video(video_array, video_save_path, fps: int = 16):
    # The pipeline returns a float video tensor in [-1, 1] laid out as (c, t, h, w);
    # convert it to uint8 frames of shape (t, h, w, c) before encoding to mp4.
    video = video_array.detach().cpu()
    video = torch.clamp(video.float(), -1.0, 1.0)
    video = video.permute(1, 0, 2, 3)  # t,c,h,w
    video = (video + 1.0) / 2.0
    video = (video * 255).to(torch.uint8).permute(0, 2, 3, 1)
    torchvision.io.write_video(
        video_save_path, video, fps=fps, video_codec="h264", options={"crf": "10"}
    )


example_txt = [
    "An astronaut riding a horse.",
    "Darth vader surfing in waves.",
    "light wind, feathers moving, she moves her gaze, 4k",
    "a girl floating underwater.",
    "Pikachu snowboarding.",
    "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k",
    "A musician strums his guitar, serenading the moonlit night.",
]

# Example rows follow the order of the gr.Interface inputs (and of generate()'s
# signature): prompt, guidance_scale, motion_gs, percentage, steps, frames,
# seed, randomize_seed, dtype, fps.
examples = [[i, 7.5, 0.05, 0.5, 16, 16, 0, True, "bf16", 8] for i in example_txt]


@spaces.GPU(duration=120)
@torch.inference_mode()
def generate(
    prompt: str,
    guidance_scale: float = 7.5,
    motion_gs: float = 0.05,
    percentage: float = 0.5,
    num_inference_steps: int = 4,
    num_frames: int = 16,
    seed: int = 0,
    randomize_seed: bool = False,
    param_dtype="bf16",
    fps: int = 8,
):
    seed = randomize_seed_fn(seed, randomize_seed)
    torch.manual_seed(seed)

    if param_dtype == "bf16":
        dtype = torch.bfloat16
        unet.dtype = torch.bfloat16
    elif param_dtype == "fp16":
        dtype = torch.float16
        unet.dtype = torch.float16
    elif param_dtype == "fp32":
        dtype = torch.float32
        unet.dtype = torch.float32
    else:
        raise ValueError(f"Unknown dtype: {param_dtype}")

    pipeline.unet.to(device, dtype)
    pipeline.text_encoder.to(device, dtype)
    pipeline.vae.to(device, dtype)
    pipeline.to(device, dtype)

    result = pipeline(
        prompt=prompt,
        frames=num_frames,
        fps=fps,
        guidance_scale=guidance_scale,
        motion_gs=motion_gs,
        use_motion_cond=True,
        percentage=percentage,
        num_inference_steps=num_inference_steps,
        lcm_origin_steps=200,
        num_videos_per_prompt=1,
    )
    torch.cuda.empty_cache()

    tmp_save_path = "tmp.mp4"
    root_path = "./videos/"
    os.makedirs(root_path, exist_ok=True)
    video_save_path = os.path.join(root_path, tmp_save_path)
    save_video(result[0], video_save_path, fps=fps)

    display_model_info = (
        f"Video size: {num_frames}x320x512, "
        f"Sampling Step: {num_inference_steps}, "
        f"Guidance Scale: {guidance_scale}"
    )
    return video_save_path, prompt, display_model_info, seed


block_css = """
#buttons button {
    min-width: min(120px,100%);
}
"""
if __name__ == "__main__":
    device = torch.device("cuda:0")

    # Build the base VideoCrafter2 model from its config and checkpoint.
    config = OmegaConf.load("configs/inference_t2v_512_v2.0.yaml")
    model_config = config.pop("model", OmegaConf.create())
    pretrained_t2v = instantiate_from_config(model_config)
    pretrained_t2v = load_model_checkpoint(
        pretrained_t2v, "checkpoints/VideoCrafter2_model.ckpt"
    )

    # Swap in the distilled, motion-conditioned UNet.
    unet_config = model_config["params"]["unet_config"]
    unet_config["params"]["use_checkpoint"] = False
    unet_config["params"]["time_cond_proj_dim"] = 256
    unet_config["params"]["motion_cond_proj_dim"] = 256

    unet = instantiate_from_config(unet_config)
    unet.load_state_dict(torch.load("checkpoints/unet_mg.pt", map_location=device))
    unet.eval()
    pretrained_t2v.model.diffusion_model = unet

    scheduler = T2VTurboScheduler(
        linear_start=model_config["params"]["linear_start"],
        linear_end=model_config["params"]["linear_end"],
    )
    pipeline = T2VTurboVC2Pipeline(pretrained_t2v, scheduler, model_config)
    pipeline.to(device)

    demo = gr.Interface(
        fn=generate,
        inputs=[
            Textbox(label="", placeholder="Please enter your prompt"),
            gr.Slider(
                label="CFG Guidance",
                minimum=1,
                maximum=21,
                step=0.1,
                value=7.5,
                info="Behaves like CFG guidance on a txt2img diffusion model. 7.5 does indeed appear to be the sweet spot, but for certain prompts you may wish to adjust it.",
            ),
            gr.Slider(
                label="MGS Guidance (Don't Change This)",
                minimum=0.0,
                maximum=1.0,
                step=0.01,
                value=0.05,
                info="No idea where they came up with the default of 0.05 or why they're so certain it's optimal, since it's not mentioned in the paper. I've therefore opened it up for experimentation, with very low expectations.",
            ),
            gr.Slider(
                label="Motion Guidance Percentage (Don't Change This)",
                minimum=0.0,
                maximum=0.8,
                step=0.05,
                value=0.5,
                info="The authors specifically say in their paper that it's important to apply motion guidance to only the first N inference steps out of M total steps (e.g. 0.5 means the first 8 of 16 steps). The ideal value of N/M is not mentioned, though, so it may be worth playing with.",
            ),
            gr.Slider(
                label="Inference Steps",
                minimum=2,
                maximum=200,
                step=1,
                value=16,
                info="This is an interesting one, because increasing the step count is the equivalent of techniques like CoT that we use to increase test-time compute in LLMs. In general, more steps = lower loss (higher quality), but the relationship is asymptotic and the returns diminish quickly. Opened this up in case it's needed for certain use cases; otherwise leave it at 16.",
            ),
            # See _frame_count_sweep at the end of this file for one way to probe
            # how far the frame count can be pushed.
            gr.Slider(
                label="Number of Video Frames",
                minimum=16,
                maximum=96,
                step=8,
                value=16,
                info="Generated video length = number of frames / FPS (e.g. 16 frames at 8 fps is a 2-second clip; 96 frames at 24 fps is 4 seconds). The benchmark evals used 16 frames, to my knowledge. It is unclear how high you can go before consistency falls apart, but it would be lovely to get 96 frames at 24 fps of high-quality video. Probably won't happen, but just in case, feel free to try.",
            ),
            gr.Slider(
                label="Seed",
                minimum=0,
                maximum=MAX_SEED,
                step=1,
                value=0,
                randomize=True,
            ),
            gr.Checkbox(label="Randomize seed", value=True),
            gr.Radio(
                ["bf16", "fp16", "fp32"],
                label="torch.dtype",
                value="bf16",
                interactive=True,
                info="bf16 is fast and high quality. End users should not change this setting.",
            ),
            gr.Slider(
                label="Desired Output FPS",
                minimum=8,
                maximum=24,
                step=8,
                value=8,
                info="Higher = smoother motion, lower = a longer video; purely a matter of preference.",
            ),
        ],
        outputs=[
            gr.Video(
                label="Generated Video",
                width=512,
                height=320,
                interactive=False,
                autoplay=True,
            ),
            Textbox(label="input prompt"),
            Textbox(label="model info"),
            gr.Slider(label="seed"),
        ],
        description=DESCRIPTION,
        theme=gr.themes.Default(),
        css=block_css,
        examples=examples,
        cache_examples=False,
        concurrency_limit=10,
    )
    demo.launch()
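
# Hedged sketch, not wired into the UI: the "Number of Video Frames" slider info
# above notes that it is unclear how many frames you can request before temporal
# consistency falls apart. A manual sweep along these lines is one way to probe
# that. It assumes the pipeline globals (pipeline, unet, device) created in the
# __main__ block above already exist in the process; the prompt and the frame
# counts below are arbitrary choices, not values from the paper.
def _frame_count_sweep(prompt: str = "An astronaut riding a horse."):
    import shutil

    outputs = []
    for num_frames in (16, 32, 48, 64, 96):
        video_path, _, info, seed = generate(
            prompt,
            guidance_scale=7.5,
            motion_gs=0.05,
            percentage=0.5,
            num_inference_steps=16,
            num_frames=num_frames,
            seed=0,
            randomize_seed=False,
            param_dtype="bf16",
            fps=8,
        )
        # generate() always writes to ./videos/tmp.mp4, so keep a copy per run.
        keep_path = os.path.join("./videos", f"sweep_{num_frames}_frames.mp4")
        shutil.copyfile(video_path, keep_path)
        outputs.append((num_frames, keep_path, info, seed))
    return outputs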
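
# Hedged sketch of the client side: the description above mentions getting this
# Space's API to work with HuggingChat's tools interface. Something along these
# lines should work with gradio_client, assuming the default gr.Interface endpoint
# name "/predict"; the Space id below is a placeholder, not the real repo name.
# It is illustrative only and never called by this app.
def _example_client_call(space_id: str = "your-username/t2v-turbo-v2"):
    from gradio_client import Client

    client = Client(space_id)
    # Positional arguments follow the order of the gr.Interface inputs above.
    return client.predict(
        "An astronaut riding a horse.",  # prompt
        7.5,     # CFG guidance
        0.05,    # MGS guidance
        0.5,     # motion guidance percentage
        16,      # inference steps
        16,      # number of video frames
        0,       # seed
        True,    # randomize seed
        "bf16",  # torch.dtype
        8,       # desired output fps
        api_name="/predict",
    )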