multimodalart (HF Staff) committed · verified
Commit 9e195fc · 1 Parent(s): 7b7a87f

Update app.py

Files changed (1): app.py (+124, -114)
app.py CHANGED
@@ -37,7 +37,6 @@ hf_hub_download_local(repo_id="Kijai/WanVideo_comfy", filename="Wan22-Lightning/
 print("Downloads complete.")
 
 # --- Boilerplate code from the original script ---
-
 def get_value_at_index(obj: Union[Sequence, Mapping], index: int) -> Any:
     """Returns the value at the given index of a sequence or mapping.
 
@@ -88,7 +87,6 @@ def add_comfyui_directory_to_sys_path() -> None:
     """
     Add 'ComfyUI' to the sys.path
     """
-    # Use a more robust name to find the ComfyUI directory
     comfyui_path = find_path("ComfyUI")
     if comfyui_path is not None and os.path.isdir(comfyui_path):
         sys.path.append(comfyui_path)
@@ -132,8 +130,6 @@ def import_custom_nodes() -> None:
 
 
 # --- Model Loading and Caching ---
-
-# Dictionary to hold all loaded models and node instances
 MODELS_AND_NODES = {}
 
 print("Setting up ComfyUI paths...")
@@ -215,12 +211,21 @@ print("All models loaded successfully!")
 
 # --- Main Video Generation Logic ---
 @spaces.GPU(duration=120)
-def generate_video(start_image_pil: Image.Image, end_image_pil: Image.Image, prompt: str, negative_prompt: str, progress=gr.Progress(track_tqdm=True)):
+def generate_video(
+    start_image_pil,
+    end_image_pil,
+    prompt,
+    negative_prompt="色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走,过曝,",
+    duration=2,
+    progress=gr.Progress(track_tqdm=True)
+):
     """
     The main function to generate a video based on user inputs.
     This function is called every time the user clicks the 'Generate' button.
     """
-    # Use pre-loaded models and nodes from the global dictionary
+    FPS = 16
+    num_frames = max(2, int(duration * FPS))
+
     clip = MODELS_AND_NODES["clip"]
     vae = MODELS_AND_NODES["vae"]
     model_low_noise = MODELS_AND_NODES["model_low_noise"]
@@ -246,122 +251,127 @@ def generate_video(start_image_pil: Image.Image, end_image_pil: Image.Image, pro
     start_image_path = start_file.name
     end_image_path = end_file.name
 
-    try:
-        with torch.inference_mode():
-            progress(0.1, desc="Encoding text and images...")
-            # --- Workflow execution ---
-            positive_conditioning = cliptextencode.encode(text=prompt, clip=get_value_at_index(clip, 0))
-            negative_conditioning = cliptextencode.encode(text=negative_prompt, clip=get_value_at_index(clip, 0))
-
-            start_image_loaded = loadimage.load_image(image=start_image_path)
-            end_image_loaded = loadimage.load_image(image=end_image_path)
-
-            clip_vision_encoded_start = clipvisionencode.encode(
-                crop="none", clip_vision=get_value_at_index(clip_vision, 0), image=get_value_at_index(start_image_loaded, 0)
-            )
-            clip_vision_encoded_end = clipvisionencode.encode(
-                crop="none", clip_vision=get_value_at_index(clip_vision, 0), image=get_value_at_index(end_image_loaded, 0)
-            )
-
-            progress(0.2, desc="Preparing initial latents...")
-            initial_latents = wanfirstlastframetovideo.EXECUTE_NORMALIZED(
-                width=480, height=480, length=33, batch_size=1,
-                positive=get_value_at_index(positive_conditioning, 0),
-                negative=get_value_at_index(negative_conditioning, 0),
-                vae=get_value_at_index(vae, 0),
-                clip_vision_start_image=get_value_at_index(clip_vision_encoded_start, 0),
-                clip_vision_end_image=get_value_at_index(clip_vision_encoded_end, 0),
-                start_image=get_value_at_index(start_image_loaded, 0),
-                end_image=get_value_at_index(end_image_loaded, 0),
-            )
-
-            progress(0.3, desc="Patching models...")
-            model_low_patched = modelsamplingsd3.patch(shift=8, model=get_value_at_index(model_low_noise, 0))
-            model_low_final = pathchsageattentionkj.patch(sage_attention="auto", model=get_value_at_index(model_low_patched, 0))
-
-            model_high_patched = modelsamplingsd3.patch(shift=8, model=get_value_at_index(model_high_noise, 0))
-            model_high_final = pathchsageattentionkj.patch(sage_attention="auto", model=get_value_at_index(model_high_patched, 0))
-
-            progress(0.5, desc="Running KSampler (Step 1/2)...")
-            latent_step1 = ksampleradvanced.sample(
-                add_noise="enable", noise_seed=random.randint(1, 2**64), steps=8, cfg=1,
-                sampler_name="euler", scheduler="simple", start_at_step=0, end_at_step=4,
-                return_with_leftover_noise="enable", model=get_value_at_index(model_high_final, 0),
-                positive=get_value_at_index(initial_latents, 0),
-                negative=get_value_at_index(initial_latents, 1),
-                latent_image=get_value_at_index(initial_latents, 2),
-            )
-
-            progress(0.7, desc="Running KSampler (Step 2/2)...")
-            latent_step2 = ksampleradvanced.sample(
-                add_noise="disable", noise_seed=random.randint(1, 2**64), steps=8, cfg=1,
-                sampler_name="euler", scheduler="simple", start_at_step=4, end_at_step=10000,
-                return_with_leftover_noise="disable", model=get_value_at_index(model_low_final, 0),
-                positive=get_value_at_index(initial_latents, 0),
-                negative=get_value_at_index(initial_latents, 1),
-                latent_image=get_value_at_index(latent_step1, 0),
-            )
-
-            progress(0.8, desc="Decoding VAE...")
-            decoded_images = vaedecode.decode(samples=get_value_at_index(latent_step2, 0), vae=get_value_at_index(vae, 0))
-
-            progress(0.9, desc="Creating and saving video...")
-            video_data = createvideo.create_video(fps=16, images=get_value_at_index(decoded_images, 0))
-
-            # Save the video to ComfyUI's output directory
-            save_result = savevideo.save_video(
-                filename_prefix="GradioVideo", format="mp4", codec="h264",
-                video=get_value_at_index(video_data, 0),
-            )
-
-            progress(1.0, desc="Done!")
-            return f"output/{save_result['ui']['images'][0]['filename']}"
-
-    finally:
-        # Clean up the temporary image files
-        os.unlink(start_image_path)
-        os.unlink(end_image_path)
-
-# --- Gradio UI ---
-
-def create_gradio_app():
-    with gr.Blocks(theme=gr.themes.Soft()) as app:
-        gr.Markdown("# Image-to-Video Generation App")
-        gr.Markdown("Upload a start and end frame, provide a prompt, and let the AI generate a video transitioning between them.")
-
-        with gr.Row():
-            start_image = gr.Image(type="pil", label="Start Frame")
-            end_image = gr.Image(type="pil", label="End Frame")
-
-        prompt = gr.Textbox(label="Prompt", value="the guy turns")
-        negative_prompt = gr.Textbox(
-            label="Negative Prompt",
-            value="色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走,过曝,"
-        )
-
-        generate_button = gr.Button("Generate Video", variant="primary")
-
-        output_video = gr.Video(label="Generated Video")
-
-        generate_button.click(
-            fn=generate_video,
-            inputs=[start_image, end_image, prompt, negative_prompt],
-            outputs=output_video
-        )
-
-        gr.Examples(
-            examples=[
-                ["examples/start.png", "examples/end.png", "a beautiful woman smiling"],
-                ["examples/start.png", "examples/end.png", "a robot walking through a futuristic city"],
-            ],
-            inputs=[start_image, end_image, prompt],
-            outputs=output_video,
-            fn=generate_video,
-            cache_examples=False, # Set to True if you want to pre-compute examples
-        )
-
-    return app
-
+    with torch.inference_mode():
+        progress(0.1, desc="Encoding text and images...")
+        # --- Workflow execution ---
+        positive_conditioning = cliptextencode.encode(text=prompt, clip=get_value_at_index(clip, 0))
+        negative_conditioning = cliptextencode.encode(text=negative_prompt, clip=get_value_at_index(clip, 0))
+
+        start_image_loaded = loadimage.load_image(image=start_image_path)
+        end_image_loaded = loadimage.load_image(image=end_image_path)
+
+        clip_vision_encoded_start = clipvisionencode.encode(
+            crop="none", clip_vision=get_value_at_index(clip_vision, 0), image=get_value_at_index(start_image_loaded, 0)
+        )
+        clip_vision_encoded_end = clipvisionencode.encode(
+            crop="none", clip_vision=get_value_at_index(clip_vision, 0), image=get_value_at_index(end_image_loaded, 0)
+        )
+
+        progress(0.2, desc="Preparing initial latents...")
+        initial_latents = wanfirstlastframetovideo.EXECUTE_NORMALIZED(
+            width=480, height=480, length=num_frames, batch_size=1,
+            positive=get_value_at_index(positive_conditioning, 0),
+            negative=get_value_at_index(negative_conditioning, 0),
+            vae=get_value_at_index(vae, 0),
+            clip_vision_start_image=get_value_at_index(clip_vision_encoded_start, 0),
+            clip_vision_end_image=get_value_at_index(clip_vision_encoded_end, 0),
+            start_image=get_value_at_index(start_image_loaded, 0),
+            end_image=get_value_at_index(end_image_loaded, 0),
+        )
+
+        progress(0.3, desc="Patching models...")
+        model_low_patched = modelsamplingsd3.patch(shift=8, model=get_value_at_index(model_low_noise, 0))
+        model_low_final = pathchsageattentionkj.patch(sage_attention="auto", model=get_value_at_index(model_low_patched, 0))
+
+        model_high_patched = modelsamplingsd3.patch(shift=8, model=get_value_at_index(model_high_noise, 0))
+        model_high_final = pathchsageattentionkj.patch(sage_attention="auto", model=get_value_at_index(model_high_patched, 0))
+
+        progress(0.5, desc="Running KSampler (Step 1/2)...")
+        latent_step1 = ksampleradvanced.sample(
+            add_noise="enable", noise_seed=random.randint(1, 2**64), steps=8, cfg=1,
+            sampler_name="euler", scheduler="simple", start_at_step=0, end_at_step=4,
+            return_with_leftover_noise="enable", model=get_value_at_index(model_high_final, 0),
+            positive=get_value_at_index(initial_latents, 0),
+            negative=get_value_at_index(initial_latents, 1),
+            latent_image=get_value_at_index(initial_latents, 2),
+        )
+
+        progress(0.7, desc="Running KSampler (Step 2/2)...")
+        latent_step2 = ksampleradvanced.sample(
+            add_noise="disable", noise_seed=random.randint(1, 2**64), steps=8, cfg=1,
+            sampler_name="euler", scheduler="simple", start_at_step=4, end_at_step=10000,
+            return_with_leftover_noise="disable", model=get_value_at_index(model_low_final, 0),
+            positive=get_value_at_index(initial_latents, 0),
+            negative=get_value_at_index(initial_latents, 1),
+            latent_image=get_value_at_index(latent_step1, 0),
+        )
+
+        progress(0.8, desc="Decoding VAE...")
+        decoded_images = vaedecode.decode(samples=get_value_at_index(latent_step2, 0), vae=get_value_at_index(vae, 0))
+
+        progress(0.9, desc="Creating and saving video...")
+        video_data = createvideo.create_video(fps=FPS, images=get_value_at_index(decoded_images, 0))
+
+        # Save the video to ComfyUI's output directory
+        save_result = savevideo.save_video(
+            filename_prefix="GradioVideo", format="mp4", codec="h264",
+            video=get_value_at_index(video_data, 0),
+        )
+
+        progress(1.0, desc="Done!")
+        return f"output/{save_result['ui']['images'][0]['filename']}"
+
+css = '''
+.fillable{max-width: 980px !important}
+.dark .progress-text {color: white}
+'''
+with gr.Blocks(theme=gr.themes.Citrus(), css=css) as app:
+    gr.Markdown("# Wan 2.2 First/Last Frame Video Fast")
+    gr.Markdown("Running the [Wan 2.2 First/Last Frame ComfyUI workflow](https://www.reddit.com/r/StableDiffusion/comments/1me4306/psa_wan_22_does_first_frame_last_frame_out_of_the/) on ZeroGPU")
+    with gr.Row():
+        with gr.Column():
+            with gr.Row():
+                start_image = gr.Image(type="pil", label="Start Frame")
+                end_image = gr.Image(type="pil", label="End Frame")
+
+            prompt = gr.Textbox(label="Prompt", info="Describe the transition between the two images", value="transition")
+
+            with gr.Accordion("Advanced Settings", open=False):
+                duration = gr.Slider(
+                    minimum=1.0,
+                    maximum=5.0,
+                    value=2.0,
+                    step=0.1,
+                    label="Video Duration (seconds)",
+                    info="Longer videos take longer to generate"
+                )
+                negative_prompt = gr.Textbox(
+                    label="Negative Prompt",
+                    value="色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走,过曝,",
+                    visible=False
+                )
+
+            generate_button = gr.Button("Generate Video", variant="primary")
+
+        with gr.Column():
+            output_video = gr.Video(label="Generated Video")
+
+    generate_button.click(
+        fn=generate_video,
+        inputs=[start_image, end_image, prompt, negative_prompt, duration],
+        outputs=output_video
+    )
+
+    gr.Examples(
+        examples=[
+            ["poli_tower.png", "tower_takes_off.png", "the man turns"],
+            ["capybara_zoomed.png", "capybara.webp", "a dramatic dolly zoom"],
+        ],
+        inputs=[start_image, end_image, prompt],
+        outputs=output_video,
+        fn=generate_video,
+        cache_examples="lazy",
+    )
 
 if __name__ == "__main__":
     app = create_gradio_app()
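
For reference, a minimal sketch of the duration-to-frames mapping this diff introduces (the commit inlines it in generate_video; duration_to_frames below is a hypothetical helper, assuming the 16 fps constant and the 1.0-5.0 s slider range shown above):

# Hypothetical helper mirroring the new duration handling in generate_video.
FPS = 16  # frames per second used by CreateVideo in the updated workflow

def duration_to_frames(duration_seconds: float, fps: int = FPS) -> int:
    """Convert a slider value in seconds to a frame count, with a floor of 2 frames."""
    return max(2, int(duration_seconds * fps))

# Example: the default 2.0 s duration yields 32 frames; the 5.0 s maximum yields 80.
assert duration_to_frames(2.0) == 32
assert duration_to_frames(5.0) == 80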