Spaces:

multimodalart
/

stable-video-diffusion

Running on Zero

App Files Files Community

multimodalart HF Staff committed on Nov 28, 2023

Commit

9ca6c30

1 Parent(s): a7729a1

Update app.py

Browse files

Files changed (1) hide show

app.py +22 -208

app.py CHANGED Viewed

@@ -1,73 +1,27 @@
-import math
 import os
 from glob import glob
 from pathlib import Path
 from typing import Optional
-import cv2
-import numpy as np
-import torch
-from einops import rearrange, repeat
-from fire import Fire
-from omegaconf import OmegaConf
 from PIL import Image
-from torchvision.transforms import ToTensor
-from scripts.util.detection.nsfw_and_watermark_dectection import \
-    DeepFloydDataFiltering
-from sgm.inference.helpers import embed_watermark
-from sgm.util import default, instantiate_from_config
-import gradio as gr
 import uuid
 import random
 from huggingface_hub import hf_hub_download
-hf_hub_download(repo_id="stabilityai/stable-video-diffusion-img2vid-xt", filename="svd_xt.safetensors", local_dir="checkpoints")
-version = "svd_xt"
-device = "cuda"
 max_64_bit_int = 2**63 - 1
-def load_model(
-    config: str,
-    device: str,
-    num_frames: int,
-    num_steps: int,
-):
-    config = OmegaConf.load(config)
-    if device == "cuda":
-        config.model.params.conditioner_config.params.emb_models[
-            0
-        ].params.open_clip_embedding_config.params.init_device = device
-    config.model.params.sampler_config.params.num_steps = num_steps
-    config.model.params.sampler_config.params.guider_config.params.num_frames = (
-        num_frames
-    )
-    if device == "cuda":
-        with torch.device(device):
-            model = instantiate_from_config(config.model).to(device).eval()
-    else:
-        model = instantiate_from_config(config.model).to(device).eval()
-    filter = DeepFloydDataFiltering(verbose=False, device=device)
-    return model, filter
-if version == "svd_xt":
-    num_frames = 25
-    num_steps = 30
-    model_config = "scripts/sampling/configs/svd_xt.yaml"
-else:
-    raise ValueError(f"Version {version} does not exist.")
-model, filter = load_model(
-    model_config,
-    device,
-    num_frames,
-    num_steps,
-)
 def sample(
     image: Image,
     seed: Optional[int] = None,
@@ -76,168 +30,28 @@ def sample(
     fps_id: int = 6,
     version: str = "svd_xt",
     cond_aug: float = 0.02,
-    decoding_t: int = 5,  # Number of frames decoded at a time! This eats most VRAM. Reduce if necessary.
     device: str = "cuda",
     output_folder: str = "outputs",
-    progress=gr.Progress(track_tqdm=True)
 ):
     if(randomize_seed):
         seed = random.randint(0, max_64_bit_int)
-    torch.manual_seed(seed)
-    if image.mode == "RGBA":
-        image = image.convert("RGB")
-    w, h = image.size
-    if h % 64 != 0 or w % 64 != 0:
-        width, height = map(lambda x: x - x % 64, (w, h))
-        image = image.resize((width, height))
-        print(
-            f"WARNING: Your image is of size {h}x{w} which is not divisible by 64. We are resizing to {height}x{width}!"
-        )
-    image = ToTensor()(image)
-    image = image * 2.0 - 1.0
-    image = image.unsqueeze(0).to(device)
-    H, W = image.shape[2:]
-    assert image.shape[1] == 3
-    F = 8
-    C = 4
-    shape = (num_frames, C, H // F, W // F)
-    if (H, W) != (576, 1024):
-        print(
-            "WARNING: The conditioning frame you provided is not 576x1024. This leads to suboptimal performance as model was only trained on 576x1024. Consider increasing `cond_aug`."
-        )
-    if motion_bucket_id > 255:
-        print(
-            "WARNING: High motion bucket! This may lead to suboptimal performance."
-        )
-    if fps_id < 5:
-        print("WARNING: Small fps value! This may lead to suboptimal performance.")
-    if fps_id > 30:
-        print("WARNING: Large fps value! This may lead to suboptimal performance.")
-    value_dict = {}
-    value_dict["motion_bucket_id"] = motion_bucket_id
-    value_dict["fps_id"] = fps_id
-    value_dict["cond_aug"] = cond_aug
-    value_dict["cond_frames_without_noise"] = image
-    value_dict["cond_frames"] = image + cond_aug * torch.randn_like(image)
-    value_dict["cond_aug"] = cond_aug
-    with torch.no_grad():
-        with torch.autocast(device):
-            batch, batch_uc = get_batch(
-                get_unique_embedder_keys_from_conditioner(model.conditioner),
-                value_dict,
-                [1, num_frames],
-                T=num_frames,
-                device=device,
-            )
-            c, uc = model.conditioner.get_unconditional_conditioning(
-                batch,
-                batch_uc=batch_uc,
-                force_uc_zero_embeddings=[
-                    "cond_frames",
-                    "cond_frames_without_noise",
-                ],
-            )
-            for k in ["crossattn", "concat"]:
-                uc[k] = repeat(uc[k], "b ... -> b t ...", t=num_frames)
-                uc[k] = rearrange(uc[k], "b t ... -> (b t) ...", t=num_frames)
-                c[k] = repeat(c[k], "b ... -> b t ...", t=num_frames)
-                c[k] = rearrange(c[k], "b t ... -> (b t) ...", t=num_frames)
-            randn = torch.randn(shape, device=device)
-            additional_model_inputs = {}
-            additional_model_inputs["image_only_indicator"] = torch.zeros(
-                2, num_frames
-            ).to(device)
-            additional_model_inputs["num_video_frames"] = batch["num_video_frames"]
-            def denoiser(input, sigma, c):
-                return model.denoiser(
-                    model.model, input, sigma, c, **additional_model_inputs
-                )
-            samples_z = model.sampler(denoiser, randn, cond=c, uc=uc)
-            model.en_and_decode_n_samples_a_time = decoding_t
-            samples_x = model.decode_first_stage(samples_z)
-            samples = torch.clamp((samples_x + 1.0) / 2.0, min=0.0, max=1.0)
-            os.makedirs(output_folder, exist_ok=True)
-            base_count = len(glob(os.path.join(output_folder, "*.mp4")))
-            video_path = os.path.join(output_folder, f"{base_count:06d}.mp4")
-            writer = cv2.VideoWriter(
-                video_path,
-                cv2.VideoWriter_fourcc(*"mp4v"),
-                fps_id + 1,
-                (samples.shape[-1], samples.shape[-2]),
-            )
-            samples = embed_watermark(samples)
-            samples = filter(samples)
-            vid = (
-                (rearrange(samples, "t c h w -> t h w c") * 255)
-                .cpu()
-                .numpy()
-                .astype(np.uint8)
-            )
-            for frame in vid:
-                frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
-                writer.write(frame)
-            writer.release()
     return video_path, seed
-def get_unique_embedder_keys_from_conditioner(conditioner):
-    return list(set([x.input_key for x in conditioner.embedders]))
-def get_batch(keys, value_dict, N, T, device):
-    batch = {}
-    batch_uc = {}
-    for key in keys:
-        if key == "fps_id":
-            batch[key] = (
-                torch.tensor([value_dict["fps_id"]])
-                .to(device)
-                .repeat(int(math.prod(N)))
-            )
-        elif key == "motion_bucket_id":
-            batch[key] = (
-                torch.tensor([value_dict["motion_bucket_id"]])
-                .to(device)
-                .repeat(int(math.prod(N)))
-            )
-        elif key == "cond_aug":
-            batch[key] = repeat(
-                torch.tensor([value_dict["cond_aug"]]).to(device),
-                "1 -> b",
-                b=math.prod(N),
-            )
-        elif key == "cond_frames":
-            batch[key] = repeat(value_dict["cond_frames"], "1 ... -> b ...", b=N[0])
-        elif key == "cond_frames_without_noise":
-            batch[key] = repeat(
-                value_dict["cond_frames_without_noise"], "1 ... -> b ...", b=N[0]
-            )
-        else:
-            batch[key] = value_dict[key]
-    if T is not None:
-        batch["num_video_frames"] = T
-    for key in batch.keys():
-        if key not in batch_uc and isinstance(batch[key], torch.Tensor):
-            batch_uc[key] = torch.clone(batch[key])
-    return batch, batch_uc
 def resize_image(image, output_size=(1024, 576)):
     # Calculate aspect ratios
     target_aspect = output_size[0] / output_size[1]  # Aspect ratio of the desired size

+import gradio as gr
+import torch
 import os
 from glob import glob
 from pathlib import Path
 from typing import Optional
+from diffusers import StableVideoDiffusionPipeline
+from diffusers.utils import load_image, export_to_video
 from PIL import Image
 import uuid
 import random
 from huggingface_hub import hf_hub_download
+pipe = StableVideoDiffusionPipeline.from_pretrained(
+    "stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16"
+)
+pipe.to("cuda")
+pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
 max_64_bit_int = 2**63 - 1
 def sample(
     image: Image,
     seed: Optional[int] = None,
     fps_id: int = 6,
     version: str = "svd_xt",
     cond_aug: float = 0.02,
+    decoding_t: int = 4,  # Number of frames decoded at a time! This eats most VRAM. Reduce if necessary.
     device: str = "cuda",
     output_folder: str = "outputs",
+    #progress=gr.Progress(track_tqdm=True)
 ):
+    if image.mode == "RGBA":
+        image = image.convert("RGB")
     if(randomize_seed):
         seed = random.randint(0, max_64_bit_int)
+    generator = torch.manual_seed(seed)
+    os.makedirs(output_folder, exist_ok=True)
+    base_count = len(glob(os.path.join(output_folder, "*.mp4")))
+    video_path = os.path.join(output_folder, f"{base_count:06d}.mp4")
+    frames = pipe(image, decode_chunk_size=decoding_t, generator=generator, motion_bucket_id=motion_bucket_id, noise_aug_strength=0.1).frames[0]
+    export_to_video(frames, video_path, fps=fps_id)
+    torch.manual_seed(seed)
     return video_path, seed
 def resize_image(image, output_size=(1024, 576)):
     # Calculate aspect ratios
     target_aspect = output_size[0] / output_size[1]  # Aspect ratio of the desired size