# Adapted from https://github.com/luosiallen/latent-consistency-model
from __future__ import annotations

import os
import random

import numpy as np
import torch
from diffusers.models import AutoencoderKL
from transformers import CLIPTextModel, CLIPTokenizer

from .model_scope.unet_3d_condition import UNet3DConditionModel
from .pipeline.t2v_turbo_ms_pipeline import T2VTurboMSPipeline
from .scheduler.t2v_turbo_scheduler import T2VTurboScheduler
from .utils.common_utils import set_torch_2_attn
from .utils.lora import collapse_lora, monkeypatch_remove_lora
from .utils.lora_handler import LoraHandler

# Intel Extension for PyTorch is optional; when present it enables XPU support.
try:
    import intel_extension_for_pytorch as ipex  # noqa: F401
except ImportError:
    pass

DESCRIPTION = """# T2V-Turbo πŸš€
We provide T2V-Turbo (MS) distilled from [ModelScopeT2V](https://huggingface.co/ali-vilab/text-to-video-ms-1.7b/) with the reward feedback from [HPSv2.1](https://github.com/tgxs002/HPSv2/tree/master) and [ViCLIP](https://huggingface.co/OpenGVLab/ViCLIP).

You can download the the models from [here](https://huggingface.co/jiachenli-ucsb/T2V-Turbo-MS). Check out our [Project page](https://t2v-turbo.github.io) πŸ˜„
"""
if torch.cuda.is_available():
    DESCRIPTION += "\n<p>Running on CUDA 😀</p>"
elif hasattr(torch, "xpu") and torch.xpu.is_available():
    DESCRIPTION += "\n<p>Running on XPU 🤓</p>"
else:
    DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"

MAX_SEED = np.iinfo(np.int32).max
CACHE_EXAMPLES = torch.cuda.is_available() and os.getenv("CACHE_EXAMPLES") == "1"
USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE") == "1"

"""
Operation System Options:
    If you are using MacOS, please set the following (device="mps") ;
    If you are using Linux & Windows with Nvidia GPU, please set the device="cuda";
    If you are using Linux & Windows with Intel Arc GPU, please set the device="xpu";
"""
# device = "mps"    # MacOS
# device = "xpu"    # Intel Arc GPU
device = "cuda"  # Linux & Windows

"""
   DTYPE Options:
      To reduce GPU memory you can set "DTYPE=torch.float16",
      but image quality might be compromised
"""
DTYPE = (
    torch.float16
)  # torch.float16 works as well, but pictures seem to be a bit worse
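
# NOTE: DTYPE is not referenced later in this file; inference() derives its
# dtype from the param_dtype argument. It may still be imported elsewhere.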


def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)
    return seed


class T2VTurboMSPipeline1:
    def __init__(self, device, unet_dir, base_model_dir):
        self.device = device
        pretrained_model_path = base_model_dir
        tokenizer = CLIPTokenizer.from_pretrained(
            pretrained_model_path, subfolder="tokenizer"
        )
        text_encoder = CLIPTextModel.from_pretrained(
            pretrained_model_path, subfolder="text_encoder"
        )
        vae = AutoencoderKL.from_pretrained(pretrained_model_path, subfolder="vae")
        teacher_unet = UNet3DConditionModel.from_pretrained(
            pretrained_model_path, subfolder="unet"
        )

        # The distilled UNet takes a guidance-scale embedding as an extra input
        # (as in latent consistency models), hence the time_cond_proj_dim added
        # on top of the teacher's config.
        time_cond_proj_dim = 256
        unet = UNet3DConditionModel.from_config(
            teacher_unet.config, time_cond_proj_dim=time_cond_proj_dim
        )
        # Load the teacher weights into the student; strict=False because the
        # student's guidance-embedding layers have no counterpart in the
        # teacher checkpoint.
        unet.load_state_dict(teacher_unet.state_dict(), strict=False)
        del teacher_unet
        set_torch_2_attn(unet)
        use_unet_lora = True
        lora_manager = LoraHandler(
            version="cloneofsimo",
            use_unet_lora=use_unet_lora,
            save_for_webui=True,
        )
        lora_manager.add_lora_to_model(
            use_unet_lora,
            unet,
            lora_manager.unet_replace_modules,
            lora_path=unet_dir,
            dropout=0.1,
            r=32,
        )
        # Fold the trained LoRA weights into the base parameters and strip the
        # LoRA wrappers so the UNet runs as a plain module at inference time.
        collapse_lora(unet, lora_manager.unet_replace_modules)
        monkeypatch_remove_lora(unet)
        unet.eval()

        noise_scheduler = T2VTurboScheduler()
        self.pipeline = T2VTurboMSPipeline(
            unet=unet,
            vae=vae,
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            scheduler=noise_scheduler,
        )
        self.pipeline.to(device)

    def inference(
            self,
            prompt: str,
            height: int = 320,
            width: int = 512,
            seed: int = 0,
            guidance_scale: float = 7.5,
            num_inference_steps: int = 4,
            num_frames: int = 16,
            fps: int = 16,
            randomize_seed: bool = False,
            param_dtype: str = "torch.float16",
    ):
        seed = randomize_seed_fn(seed, randomize_seed)
        torch.manual_seed(seed)
        self.pipeline.to(
            torch_device=self.device,
            torch_dtype=torch.float16 if param_dtype == "torch.float16" else torch.float32,
        )

        result = self.pipeline(
            prompt=prompt,
            height=height,
            width=width,
            frames=num_frames,
            guidance_scale=guidance_scale,
            num_inference_steps=num_inference_steps,
            num_videos_per_prompt=1,
        )

        return result
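

# Example usage: a minimal sketch, not part of the original script. The LoRA
# checkpoint path is a hypothetical placeholder, and the structure of `result`
# depends on T2VTurboMSPipeline's implementation. Because this module uses
# relative imports, it must be run with `python -m <package>.<module>`.
if __name__ == "__main__":
    t2v = T2VTurboMSPipeline1(
        device=device,
        unet_dir="checkpoints/unet_lora.pt",  # hypothetical LoRA checkpoint
        base_model_dir="ali-vilab/text-to-video-ms-1.7b",  # base ModelScopeT2V weights
    )
    result = t2v.inference(
        prompt="An astronaut riding a horse on Mars",
        num_inference_steps=4,
        randomize_seed=True,
    )
    print(type(result))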