Update app.py
app.py CHANGED
@@ -1,5 +1,5 @@
  1   import torch
  2 - from diffusers import AutoencoderKLWan, WanVACEPipeline UniPCMultistepScheduler
  3   from diffusers.utils import export_to_video
  4   from transformers import CLIPVisionModel
  5   import gradio as gr

@@ -41,15 +41,15 @@ SLIDER_MIN_H, SLIDER_MAX_H = 128, 896
 41   SLIDER_MIN_W, SLIDER_MAX_W = 128, 896
 42   MAX_SEED = np.iinfo(np.int32).max
 43   
 44 - FIXED_FPS =
 45   MIN_FRAMES_MODEL = 8
 46   MAX_FRAMES_MODEL = 81
 47   
 48   # Default prompts for different modes
 49   MODE_PROMPTS = {
 50 -     "Ref2V": "",
 51 -     "FLF2V": "",
 52 -     "Random2V": ""
 53   }
 54   
 55   default_negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards, watermark, text, signature"

@@ -96,27 +96,49 @@ def update_prompt_from_mode(mode):
 96       """Update the prompt based on the selected mode"""
 97       return MODE_PROMPTS.get(mode, "")
 98   
 99 - 
100 - 
101 - 
102 - 
103 - 
104 - 
105 - 
106 - 
107   
108 - 
109 - 
110 - 
111 -         return None
112 -     else:
113 -         return images[0]
114   
115 - 
116 - 
117 - 
118   
119 -     return
120   
121   def get_duration(gallery_images, mode, prompt, height, width,
122                    negative_prompt, duration_seconds,

@@ -159,11 +181,10 @@ def generate_video(gallery_images, mode, prompt, height, width,
159       if gallery_images is None or len(gallery_images) == 0:
160           raise gr.Error("Please upload at least one image to the gallery.")
161   
162 - 
163 - 
164 - 
165 - 
166 -         raise gr.Error("Failed to process images for the selected mode.")
167   
168       target_h = max(MOD_VALUE, (int(height) // MOD_VALUE) * MOD_VALUE)
169       target_w = max(MOD_VALUE, (int(width) // MOD_VALUE) * MOD_VALUE)

@@ -172,19 +193,33 @@ def generate_video(gallery_images, mode, prompt, height, width,
172   
173       current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
174   
175 -     resized_image = input_image.resize((target_w, target_h))
176   
177 -     #
178 -     if mode == "FLF2V"
179 - 
180 - 
181 - 
182   
183       with torch.inference_mode():
184           output_frames_list = pipe(
185 - 
186 - 
187 - 
188               generator=torch.Generator(device="cuda").manual_seed(current_seed)
189           ).frames[0]
190   

  1   import torch
  2 + from diffusers import AutoencoderKLWan, WanVACEPipeline, UniPCMultistepScheduler
  3   from diffusers.utils import export_to_video
  4   from transformers import CLIPVisionModel
  5   import gradio as gr

 41   SLIDER_MIN_W, SLIDER_MAX_W = 128, 896
 42   MAX_SEED = np.iinfo(np.int32).max
 43   
 44 + FIXED_FPS = 16
 45   MIN_FRAMES_MODEL = 8
 46   MAX_FRAMES_MODEL = 81
 47   
 48   # Default prompts for different modes
 49   MODE_PROMPTS = {
 50 +     "Ref2V": "the playful penguin picks up the green cat eye sunglasses and puts them on",
 51 +     "FLF2V": "CG animation style, a small blue bird takes off from the ground, flapping its wings. The bird's feathers are delicate, with a unique pattern on its chest. The background shows a blue sky with white clouds under bright sunshine. The camera follows the bird upward, capturing its flight and the vastness of the sky from a close-up, low-angle perspective.",
 52 +     "Random2V": "Various different characters appear and disappear in a fast transition video showcasting their unique features and personalities. The video is about showcasing different dance styles, with each character performing a distinct dance move. The background is a vibrant, colorful stage with dynamic lighting that changes with each dance style. The camera captures close-ups of the dancers' expressions and movements. Highly dynamic, fast-paced music video, with quick cuts and transitions between characters, cinematic, vibrant colors"
 53   }
 54   
 55   default_negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards, watermark, text, signature"

 96       """Update the prompt based on the selected mode"""
 97       return MODE_PROMPTS.get(mode, "")
 98   
 99 + 
100 + def prepare_video_and_mask_Ref2V(height: int, width: int, num_frames: int, img: PIL.Image.Image = None):
101 +     frames = []
102 +     # Ideally, this should be 127.5 to match original code, but they perform computation on numpy arrays
103 +     # whereas we are passing PIL images. If you choose to pass numpy arrays, you can set it to 127.5 to
104 +     # match the original code.
105 +     frames.extend([PIL.Image.new("RGB", (width, height), (128, 128, 128))] * (num_frames))
106 +     mask_white = PIL.Image.new("L", (width, height), 255)
107 +     mask = [mask_white] * (num_frames)
108 +     return frames, mask
109 + 
110 + def prepare_video_and_mask_FLF2V(first_img: PIL.Image.Image, last_img: PIL.Image.Image, height: int, width: int, num_frames: int):
111 +     first_img = first_img.resize((width, height))
112 +     last_img = last_img.resize((width, height))
113 +     frames = []
114 +     frames.append(first_img)
115 +     # Ideally, this should be 127.5 to match original code, but they perform computation on numpy arrays
116 +     # whereas we are passing PIL images. If you choose to pass numpy arrays, you can set it to 127.5 to
117 +     # match the original code.
118 +     frames.extend([PIL.Image.new("RGB", (width, height), (128, 128, 128))] * (num_frames - 2))
119 +     frames.append(last_img)
120 +     mask_black = PIL.Image.new("L", (width, height), 0)
121 +     mask_white = PIL.Image.new("L", (width, height), 255)
122 +     mask = [mask_black, *[mask_white] * (num_frames - 2), mask_black]
123 +     return frames, mask
124 + 
125 + def prepare_video_and_mask_Random2V(images: List[PIL.Image.Image], frame_indices: List[int], height: int, width: int, num_frames: int):
126 +     images = [img.resize((width, height)) for img in images]
127 +     # Ideally, this should be 127.5 to match original code, but they perform computation on numpy arrays
128 +     # whereas we are passing PIL images. If you choose to pass numpy arrays, you can set it to 127.5 to
129 +     # match the original code.
130 +     frames = [PIL.Image.new("RGB", (width, height), (128, 128, 128))] * num_frames
131   
132 +     mask_black = PIL.Image.new("L", (width, height), 0)
133 +     mask_white = PIL.Image.new("L", (width, height), 255)
134 +     mask = [mask_white] * num_frames
135   
136 +     for img, idx in zip(images, frame_indices):
137 +         assert idx < num_frames
138 +         frames[idx] = img
139 +         mask[idx] = mask_black
140   
141 +     return frames, mask
142   
143   def get_duration(gallery_images, mode, prompt, height, width,
144                    negative_prompt, duration_seconds,

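The three helpers above share one convention: frames and mask are equal-length lists of PIL images, with a black mask on every frame that was supplied directly and a white mask on the grey placeholder frames the model must fill in. A minimal sketch of that behaviour (not part of this commit; it assumes app.py is importable and Pillow is installed, and the 832x480 size and assertions are illustrative only):

import PIL.Image
from app import prepare_video_and_mask_FLF2V

first = PIL.Image.new("RGB", (832, 480), (255, 0, 0))
last = PIL.Image.new("RGB", (832, 480), (0, 0, 255))
frames, mask = prepare_video_and_mask_FLF2V(first_img=first, last_img=last, height=480, width=832, num_frames=81)

assert len(frames) == 81 and len(mask) == 81
assert mask[0].getpixel((0, 0)) == 0      # black mask: the supplied first frame is kept
assert mask[40].getpixel((0, 0)) == 255   # white mask: the middle frames are generated
assert mask[80].getpixel((0, 0)) == 0     # black mask: the supplied last frame is kept
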
181       if gallery_images is None or len(gallery_images) == 0:
182           raise gr.Error("Please upload at least one image to the gallery.")
183   
184 +     if mode == "FLF2V" and len(gallery_images) >= 2:
185 +         gallery_images = gallery_images[:2]
186 +     elif mode == "FLF2V" and len(gallery_images) < 2:
187 +         raise gr.Error("only one image was supplied, but 2 are needed for FLF2V")
188   
189       target_h = max(MOD_VALUE, (int(height) // MOD_VALUE) * MOD_VALUE)
190       target_w = max(MOD_VALUE, (int(width) // MOD_VALUE) * MOD_VALUE)

193   
194       current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
195   
196   
197 +     # Process images based on the selected mode
198 +     if mode == "FLF2V":
199 +         frames, mask = prepare_video_and_mask_FLF2V(first_img=gallery_images[0], last_img=gallery_images[1], height=target_h, width=target_w, num_frames=num_frames)
200 +         reference_images=None
201 +     elif mode == "Ref2V":
202 +         frames, mask = prepare_video_and_mask_Ref2V(height=target_h, width=target_w, num_frames=num_frames)
203 +         reference_images = gallery_images
204 +     else: # mode == "Random2V"
205 +         frames, mask = prepare_video_and_mask_Random2V(images=gallery_images, frame_indices=[0,15,40], height=target_h, width=target_w, num_frames=num_frames)
206 +         reference_images=None
207 + 
208 +     # resized_image = input_image.resize((target_w, target_h))
209 + 
210   
211       with torch.inference_mode():
212           output_frames_list = pipe(
213 +             video=frames,
214 +             mask=mask,
215 +             reference_images=reference_images,
216 +             prompt=prompt,
217 +             negative_prompt=negative_prompt,
218 +             height=target_h,
219 +             width=target_w,
220 +             num_frames=num_frames,
221 +             guidance_scale=float(guidance_scale),
222 +             num_inference_steps=int(steps),
223               generator=torch.Generator(device="cuda").manual_seed(current_seed)
224           ).frames[0]
225   
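In the Random2V branch, frame_indices=[0,15,40] pins the uploaded gallery images at those positions in the generated clip while every other frame stays a grey placeholder with a white (generate) mask. A small sketch of that behaviour (again not part of this commit; it assumes app.py is importable and Pillow is installed, with solid-colour stand-ins for the gallery images):

import PIL.Image
from app import prepare_video_and_mask_Random2V

# Three solid-colour stand-ins for the uploaded gallery images.
imgs = [PIL.Image.new("RGB", (832, 480), c) for c in [(255, 0, 0), (0, 255, 0), (0, 0, 255)]]
frames, mask = prepare_video_and_mask_Random2V(images=imgs, frame_indices=[0, 15, 40], height=480, width=832, num_frames=81)

# Only the pinned positions carry a black (keep) mask; every other frame is generated.
assert [i for i, m in enumerate(mask) if m.getpixel((0, 0)) == 0] == [0, 15, 40]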