Spaces: Running on Zero

Update app.py

app.py CHANGED
@@ -21,12 +21,12 @@ pipe.load_lora_weights(
     weight_name="FusionX_LoRa/Phantom_Wan_14B_FusionX_LoRA.safetensors",
     adapter_name="phantom"
 )
-
-
-
-
-
-
+pipe.load_lora_weights(
+    "vrgamedevgirl84/Wan14BT2VFusioniX",
+    weight_name="OtherLoRa's/DetailEnhancerV1.safetensors", adapter_name="detailer"
+)
+pipe.set_adapters(["phantom","detailer"], adapter_weights=[1, .9])
+pipe.fuse_lora()

 MOD_VALUE = 32
 DEFAULT_H_SLIDER_VALUE = 512
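Reviewer note: the new lines 24-29 stack a second LoRA on the same pipeline and fuse both adapters into the base weights. A minimal, hedged sketch of that pattern follows; it is not part of the commit. The base checkpoint ID and the repo for the "phantom" LoRA are not visible in this hunk, so both are assumptions below; only load_lora_weights, set_adapters, and fuse_lora are taken from the diff.

import torch
from diffusers import DiffusionPipeline

# Assumption: base checkpoint; the actual repo is defined above this hunk in app.py.
pipe = DiffusionPipeline.from_pretrained(
    "Wan-AI/Wan2.1-VACE-14B-diffusers", torch_dtype=torch.bfloat16
)

# Assumption: the "phantom" LoRA lives in the same repo as the "detailer" one.
pipe.load_lora_weights(
    "vrgamedevgirl84/Wan14BT2VFusioniX",
    weight_name="FusionX_LoRa/Phantom_Wan_14B_FusionX_LoRA.safetensors",
    adapter_name="phantom",
)
pipe.load_lora_weights(
    "vrgamedevgirl84/Wan14BT2VFusioniX",
    weight_name="OtherLoRa's/DetailEnhancerV1.safetensors",
    adapter_name="detailer",
)

# Activate both adapters with per-adapter scales, then bake them into the
# base weights so inference pays no extra LoRA overhead.
pipe.set_adapters(["phantom", "detailer"], adapter_weights=[1.0, 0.9])
pipe.fuse_lora()

Fusing is optional; leaving the adapters unfused would keep the adapter_weights adjustable at runtime at a small inference cost.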
@@ -77,7 +77,7 @@ def handle_gallery_upload_for_dims_wan(gallery_images, current_h_val, current_w_
         return gr.update(value=DEFAULT_H_SLIDER_VALUE), gr.update(value=DEFAULT_W_SLIDER_VALUE)
     try:
         # Use the first image to calculate dimensions
-        first_image = gallery_images[0]
+        first_image = gallery_images[0]
         new_h, new_w = _calculate_new_dimensions_wan(
             first_image, MOD_VALUE, NEW_FORMULA_MAX_AREA,
             SLIDER_MIN_H, SLIDER_MAX_H, SLIDER_MIN_W, SLIDER_MAX_W,
@@ -176,8 +176,6 @@ def generate_video(gallery_images, mode, prompt, height, width,
     """
     if gallery_images is None or len(gallery_images) == 0:
        raise gr.Error("Please upload at least one image to the gallery.")
-    else:
-        gallery_images = [img[0] for img in gallery_images]

     if mode == "FLF2V" and len(gallery_images) >= 2:
         gallery_images = gallery_images[:2]
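Reviewer note on the deleted else branch: a gr.Gallery input typically arrives as a list of (image, caption) tuples, which is what the removed img[0] unwrapping handled; whether downstream code still needs it depends on the component's type= and Gradio version. A defensive unwrap like the hedged helper below (illustrative name, not in the app) keeps both shapes working.

from typing import Any, List

def unwrap_gallery(items: List[Any]) -> List[Any]:
    # Accept either plain images/paths or (image, caption) tuples, the two
    # shapes gr.Gallery commonly produces depending on version and type=.
    return [it[0] if isinstance(it, (tuple, list)) else it for it in items]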
@@ -203,12 +201,26 @@ def generate_video(gallery_images, mode, prompt, height, width,
         reference_images = None
     elif mode == "Ref2V":
         frames, mask = prepare_video_and_mask_Ref2V(height=target_h, width=target_w, num_frames=num_frames)
-
+        # Resize reference images to match target dimensions
+        reference_images = [img.resize((target_w, target_h)) for img in gallery_images]
     else: # mode == "Random2V"
+        # Calculate appropriate frame indices based on number of images and frames
+        num_images = len(gallery_images)
+        if num_images == 1:
+            frame_indices = [num_frames // 2]  # Place single image in the middle
+        elif num_images == 2:
+            frame_indices = [0, num_frames - 1]  # Place at start and end
+        else:
+            # Distribute images evenly across the video
+            # Ensure we don't exceed available frames
+            max_images = min(num_images, num_frames)
+            step = max(1, num_frames // max_images)
+            frame_indices = [min(i * step, num_frames - 1) for i in range(max_images)]
+            gallery_images = gallery_images[:max_images]  # Limit images to what we can use

         frames, mask = prepare_video_and_mask_Random2V(
             images=gallery_images,
-            frame_indices=
+            frame_indices=frame_indices,
             height=target_h,
             width=target_w,
             num_frames=num_frames
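The Random2V branch above derives frame_indices from the number of uploaded images. A standalone mirror of that logic (illustrative function name, reviewer note only, not part of the commit) makes the placement easy to sanity-check:

def random2v_frame_indices(num_images: int, num_frames: int) -> list[int]:
    # Mirrors the index logic added in this hunk.
    if num_images == 1:
        return [num_frames // 2]              # single image lands mid-clip
    if num_images == 2:
        return [0, num_frames - 1]            # first and last frame
    max_images = min(num_images, num_frames)  # never more images than frames
    step = max(1, num_frames // max_images)
    return [min(i * step, num_frames - 1) for i in range(max_images)]

print(random2v_frame_indices(1, 81))  # [40]
print(random2v_frame_indices(2, 81))  # [0, 80]
print(random2v_frame_indices(3, 81))  # [0, 27, 54]

Note that the even-distribution branch anchors the first image at frame 0 and steps forward, so the final frame is not necessarily covered when three or more images are supplied.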
@@ -235,20 +247,9 @@ def generate_video(gallery_images, mode, prompt, height, width,
     export_to_video(output_frames_list, video_path, fps=FIXED_FPS)
     return video_path, current_seed

-control_modes = """
-**3 control modes avilable:**
-
-**Ref2V (Reference-to-Video)** Generate a video incorporating elements from input reference images
-
-**FLF2V (First-Last Frame-to-Video)** Generate a video using first and last frame conditioning defined by input images
-
-**Random2V (Random-to-Video)** Generate a video with intermediate transitions between multiple input images
-"""
-
 with gr.Blocks() as demo:
-    gr.Markdown("#
-    gr.Markdown("Using [
-    gr.Markdown(f"{control_modes}")
+    gr.Markdown("# Wan 2.1 VACE (14B) with Phantom & Detail Enhancer LoRAs - Multi-Image Gallery")
+    gr.Markdown("Using [Wan2.1-VACE-14B](https://huggingface.co/Wan-AI/Wan2.1-VACE-14B-diffusers) with Phantom FusionX and Detail Enhancer LoRAs for advanced video generation with multiple conditioning modes.")

     with gr.Row():
         with gr.Column():
@@ -269,8 +270,8 @@ with gr.Blocks() as demo:
             mode_radio = gr.Radio(
                 choices=["Ref2V", "FLF2V", "Random2V"],
                 value="Ref2V",
-                label="
-                info="Ref2V: Reference to Video | FLF2V: First-Last Frame to Video | Random2V: Random to Video"
+                label="Processing Mode",
+                info="Ref2V: Reference to Video | FLF2V: First-Last Frame to Video | Random2V: Random frames to Video"
             )

             prompt_input = gr.Textbox(label="Prompt", value=MODE_PROMPTS["Ref2V"])
@@ -278,7 +279,7 @@ with gr.Blocks() as demo:
                 minimum=round(MIN_FRAMES_MODEL/FIXED_FPS,1),
                 maximum=round(MAX_FRAMES_MODEL/FIXED_FPS,1),
                 step=0.1,
-                value=2
+                value=2,
                 label="Duration (seconds)",
                 info=f"Clamped to model's {MIN_FRAMES_MODEL}-{MAX_FRAMES_MODEL} frames at {FIXED_FPS}fps."
             )
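The duration slider's info string implies a seconds-to-frames conversion clamped to the model's range. A hedged sketch of that mapping follows (reviewer note only); the app's actual helper is not shown in this diff, and FIXED_FPS, MIN_FRAMES_MODEL, and MAX_FRAMES_MODEL below are placeholder values, not the constants defined in app.py.

# Placeholder constants; the real values are defined earlier in app.py.
FIXED_FPS = 16
MIN_FRAMES_MODEL = 5
MAX_FRAMES_MODEL = 81

def duration_to_num_frames(duration_s: float) -> int:
    # Convert the slider value (seconds) to a frame count, clamped to the
    # model's supported range, as the slider's info text describes.
    return int(max(MIN_FRAMES_MODEL, min(MAX_FRAMES_MODEL, round(duration_s * FIXED_FPS))))

print(duration_to_num_frames(2))  # 32 with the placeholder constants above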
@@ -290,13 +291,22 @@ with gr.Blocks() as demo:
             with gr.Row():
                 height_input = gr.Slider(minimum=SLIDER_MIN_H, maximum=SLIDER_MAX_H, step=MOD_VALUE, value=DEFAULT_H_SLIDER_VALUE, label=f"Output Height (multiple of {MOD_VALUE})")
                 width_input = gr.Slider(minimum=SLIDER_MIN_W, maximum=SLIDER_MAX_W, step=MOD_VALUE, value=DEFAULT_W_SLIDER_VALUE, label=f"Output Width (multiple of {MOD_VALUE})")
-            steps_slider = gr.Slider(minimum=1, maximum=
+            steps_slider = gr.Slider(minimum=1, maximum=30, step=1, value=6, label="Inference Steps")
             guidance_scale_input = gr.Slider(minimum=0.0, maximum=5.0, step=0.5, value=1.0, label="Guidance Scale", visible=False)

             generate_button = gr.Button("Generate Video", variant="primary")

         with gr.Column():
             video_output = gr.Video(label="Generated Video", autoplay=True, interactive=False)
+            with gr.Accordion("Mode Information", open=True):
+                gr.Markdown("""
+                **Processing Modes:**
+                - **Ref2V**: Uses uploaded images as style references for video generation. All frames are generated based on the reference images.
+                - **FLF2V**: First-Last Frame mode - uses first and last images as keyframes and generates the frames in between (requires exactly 2 images)
+                - **Random2V**: Places uploaded images at specific frames in the video and generates the rest. Images are distributed evenly across the video duration.
+
+                **Note**: VACE pipeline supports advanced conditioning with masks and reference images for more control over generation.
+                """)

     # Update prompt when mode changes
     mode_radio.change(
@@ -319,13 +329,6 @@ with gr.Blocks() as demo:
     ]

     generate_button.click(fn=generate_video, inputs=ui_inputs, outputs=[video_output, seed_input])
-    gr.Examples(
-        examples=[
-            [["reachy.png", "sunglasses.jpg", "gpu_hat.png"], "Ref2V", "the cute robot is wearing the sunglasses and the hat that reads 'GPU poor', and moves around playfully", 480, 832],
-            [["flf2v_input_first_frame.png", "flf2v_input_last_frame.png"], "FLF2V", "CG animation style, a small blue bird takes off from the ground, flapping its wings. The bird's feathers are delicate, with a unique pattern on its chest. The background shows a blue sky with white clouds under bright sunshine. The camera follows the bird upward, capturing its flight and the vastness of the sky from a close-up, low-angle perspective.", 512, 512],
-        ],
-        inputs=[gallery_component, mode_radio, prompt_input, height_input, width_input], outputs=[video_output, seed_input], fn=generate_video, cache_examples="lazy"
-    )

 if __name__ == "__main__":
     demo.queue().launch(mcp_server=True)
|