Spaces:

multimodalart
/

self-forcing

Running on Zero

App Files Files Community

multimodalart HF Staff committed 15 days ago

Commit

1b86783

verified ·

1 Parent(s): d7a915c

Update app.py

Browse files

Files changed (1) hide show

app.py +65 -101

app.py CHANGED Viewed

@@ -24,6 +24,7 @@ import random
 import argparse
 import hashlib
 import urllib.request
 from PIL import Image
 import spaces
 import numpy as np
@@ -31,27 +32,23 @@ import torch
 import gradio as gr
 from omegaconf import OmegaConf
 from tqdm import tqdm
-import imageio # Added for final video rendering
-# FastRTC imports
-from fastrtc import WebRTC, get_cloudflare_turn_credentials
-from fastrtc.utils import AdditionalOutputs #, CloseStream
 # Original project imports
 from pipeline import CausalInferencePipeline
 from demo_utils.constant import ZERO_VAE_CACHE
 from demo_utils.vae_block3 import VAEDecoderWrapper
 from utils.wan_wrapper import WanDiffusionWrapper, WanTextEncoder
-# from demo_utils.memory import gpu, get_cuda_free_memory_gb, DynamicSwapInstaller
 # --- Argument Parsing ---
-parser = argparse.ArgumentParser(description="Gradio Demo for Self-Forcing with FastRTC")
 parser.add_argument('--port', type=int, default=7860, help="Port to run the Gradio app on.")
 parser.add_argument('--host', type=str, default='0.0.0.0', help="Host to bind the Gradio app to.")
 parser.add_argument("--checkpoint_path", type=str, default='./checkpoints/self_forcing_dmd.pt', help="Path to the model checkpoint.")
 parser.add_argument("--config_path", type=str, default='./configs/self_forcing_dmd.yaml', help="Path to the model config.")
 parser.add_argument('--share', action='store_true', help="Create a public Gradio link.")
 parser.add_argument('--trt', action='store_true', help="Use TensorRT optimized VAE decoder.")
 args = parser.parse_args()
 gpu = "cuda"
@@ -146,24 +143,22 @@ pipeline = CausalInferencePipeline(
 pipeline.to(dtype=torch.float16).to(gpu)
-# --- Additional Outputs Handler ---
-def handle_additional_outputs(status_html_update, video_update, webrtc_output):
-    return status_html_update, video_update, webrtc_output
-# --- FastRTC Video Generation Handler ---
 @torch.no_grad()
 @spaces.GPU
-def video_generation_handler(prompt, seed, progress=gr.Progress()):
     """
-    Generator function that yields BGR NumPy frames for real-time streaming.
-    Returns cleanly when done - no infinite loops.
     """
     if seed == -1:
         seed = random.randint(0, 2**32 - 1)
     print(f"🎬 Starting video generation with prompt: '{prompt}' and seed: {seed}")
     print("🔤 Encoding text prompt...")
     conditional_dict = text_encoder(text_prompts=[prompt])
     for key, value in conditional_dict.items():
@@ -184,7 +179,7 @@ def video_generation_handler(prompt, seed, progress=gr.Progress()):
     all_num_frames = [pipeline.num_frame_per_block] * num_blocks
     total_frames_yielded = 0
-    all_frames_for_video = [] # To collect frames for final video
     for idx, current_num_frames in enumerate(all_num_frames):
         print(f"📦 Processing block {idx+1}/{num_blocks} with {current_num_frames} frames")
@@ -235,7 +230,7 @@ def video_generation_handler(prompt, seed, progress=gr.Progress()):
         print(f"📹 Decoded pixels shape: {pixels.shape}")
-        # Yield individual frames WITH status updates
         for frame_idx in range(pixels.shape[1]):
             frame_tensor = pixels[0, frame_idx]  # Get single frame [C, H, W]
@@ -243,73 +238,47 @@ def video_generation_handler(prompt, seed, progress=gr.Progress()):
             frame_np = torch.clamp(frame_tensor.float(), -1., 1.) * 127.5 + 127.5
             frame_np = frame_np.to(torch.uint8).cpu().numpy()
-            # Convert from CHW to HWC format
             frame_np = np.transpose(frame_np, (1, 2, 0))  # CHW -> HWC
             all_frames_for_video.append(frame_np)
-            # Convert RGB to BGR for FastRTC (OpenCV format)
-            frame_bgr = frame_np[:, :, ::-1]  # RGB -> BGR
             total_frames_yielded += 1
-            print(f"📺 Yielding frame {total_frames_yielded}: shape {frame_bgr.shape}, dtype {frame_bgr.dtype}")
             # Calculate progress
             total_expected_frames = num_blocks * pipeline.num_frame_per_block
             current_frame_count = (idx * pipeline.num_frame_per_block) + frame_idx + 1
-            frame_progress = 100 * (current_frame_count / total_expected_frames)
-            # --- REVISED HTML START ---
-            if frame_idx == pixels.shape[1] - 1 and idx + 1 == num_blocks: # last frame
-                status_html = (
-                    f"<div style='padding: 16px; border: 1px solid #198754; background-color: #d1e7dd; border-radius: 8px; font-family: sans-serif; text-align: center;'>"
-                    f"  <h4 style='margin: 0 0 8px 0; color: #0f5132; font-size: 18px;'>🎉 Generation Complete!</h4>"
-                    f"  <p style='margin: 0; color: #0f5132;'>"
-                    f"    Total frames: {total_frames_yielded}. The final video is now available."
-                    f"  </p>"
-                    f"</div>"
-                )
-                print("💾 Saving final rendered video...")
-                video_update = gr.update() # Default to no-op
-                try:
-                    video_path = f"gradio_tmp/{seed}_{hashlib.md5(prompt.encode()).hexdigest()}.mp4"
-                    imageio.mimwrite(video_path, all_frames_for_video, fps=15, quality=8)
-                    print(f"✅ Video saved to {video_path}")
-                    video_update = gr.update(value=video_path, visible=True)
-                except Exception as e:
-                    print(f"⚠️ Could not save final video: {e}")
-                yield frame_bgr, AdditionalOutputs(status_html, video_update, gr.update(visible=False))
-                # yield CloseStream("🎉 Video generation completed successfully!")
-                return
-            else:  # Regular frames - simpler status
-                status_html = (
-                    f"<div style='padding: 10px; border: 1px solid #ddd; border-radius: 8px; font-family: sans-serif;'>"
-                    f"  <p style='margin: 0 0 8px 0; font-size: 16px; font-weight: bold;'>Generating Video...</p>"
-                    f"  <div style='background: #e9ecef; border-radius: 4px; width: 100%; overflow: hidden;'>"
-                    f"    <div style='width: {frame_progress:.1f}%; height: 20px; background-color: #0d6efd; transition: width 0.2s;'></div>"
-                    f"  </div>"
-                    f"  <p style='margin: 8px 0 0 0; color: #555; font-size: 14px; text-align: right;'>"
-                    f"    Block {idx+1}/{num_blocks}   |   Frame {total_frames_yielded}   |   {frame_progress:.1f}%"
-                    f"  </p>"
-                    f"</div>"
-                )
-            # --- REVISED HTML END ---
-            yield frame_bgr, AdditionalOutputs(status_html, gr.update(visible=False), gr.update(visible=True))
         current_start_frame += current_num_frames
     print(f"✅ Video generation completed! Total frames yielded: {total_frames_yielded}")
-    # Signal completion
-    # yield CloseStream("🎉 Video generation completed successfully!")
 # --- Gradio UI Layout ---
-with gr.Blocks(theme=gr.themes.Soft(), title="Self-Forcing FastRTC Demo") as demo:
-    gr.Markdown("# 🚀 Self-Forcing Video Generation with FastRTC Streaming")
-    gr.Markdown("*Real-time video generation streaming via WebRTC*")
     with gr.Row():
         with gr.Column(scale=2):
@@ -332,47 +301,42 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Self-Forcing FastRTC Demo") as dem
             with gr.Row():
                 seed = gr.Number(label="Seed", value=-1, info="Use -1 for a random seed.")
-            with gr.Accordion("⚙️ Performance Options", open=False):
-                gr.Markdown("*These optimizations are applied once per session*")
             start_btn = gr.Button("🎬 Start Generation", variant="primary", size="lg")
         with gr.Column(scale=3):
-            gr.Markdown("### 📺 Live Video Stream")
-            gr.Markdown("*Click 'Start Generation' to begin streaming*")
-            webrtc_output = WebRTC(
-                label="Generated Video Stream",
-                modality="video",
-                mode="receive",  # Server sends video to client
                 height=480,
                 width=832,
-                rtc_configuration=get_cloudflare_turn_credentials(),
-                elem_id="video_stream"
             )
-            final_video = gr.Video(label="Final Rendered Video", visible=False, interactive=False)
-            status_html = gr.HTML(
-                value="<div style='text-align: center; padding: 20px; color: #666;'>Ready to start generation...</div>",
-                label="Generation Status"
             )
-    # Connect the generator to the WebRTC stream
-    webrtc_output.stream(
-        fn=video_generation_handler,
-        inputs=[prompt, seed],
-        outputs=[webrtc_output],
-        time_limit=300,  # 5 minutes max
-        trigger=start_btn.click,
-    )
-    # MODIFIED: Handle additional outputs (status updates AND final video)
-    webrtc_output.on_additional_outputs(
-        fn=handle_additional_outputs,
-        outputs=[status_html, final_video, webrtc_output]
     )
 # --- Launch App ---

 import argparse
 import hashlib
 import urllib.request
+import time
 from PIL import Image
 import spaces
 import numpy as np
 import gradio as gr
 from omegaconf import OmegaConf
 from tqdm import tqdm
+import imageio
 # Original project imports
 from pipeline import CausalInferencePipeline
 from demo_utils.constant import ZERO_VAE_CACHE
 from demo_utils.vae_block3 import VAEDecoderWrapper
 from utils.wan_wrapper import WanDiffusionWrapper, WanTextEncoder
 # --- Argument Parsing ---
+parser = argparse.ArgumentParser(description="Gradio Demo for Self-Forcing with Frame Streaming")
 parser.add_argument('--port', type=int, default=7860, help="Port to run the Gradio app on.")
 parser.add_argument('--host', type=str, default='0.0.0.0', help="Host to bind the Gradio app to.")
 parser.add_argument("--checkpoint_path", type=str, default='./checkpoints/self_forcing_dmd.pt', help="Path to the model checkpoint.")
 parser.add_argument("--config_path", type=str, default='./configs/self_forcing_dmd.yaml', help="Path to the model config.")
 parser.add_argument('--share', action='store_true', help="Create a public Gradio link.")
 parser.add_argument('--trt', action='store_true', help="Use TensorRT optimized VAE decoder.")
+parser.add_argument('--fps', type=float, default=15.0, help="Playback FPS for frame streaming.")
 args = parser.parse_args()
 gpu = "cuda"
 pipeline.to(dtype=torch.float16).to(gpu)
+# --- Frame Streaming Video Generation Handler ---
 @torch.no_grad()
 @spaces.GPU
+def video_generation_handler(prompt, seed, fps, progress=gr.Progress()):
     """
+    Generator function that yields RGB frames for display in gr.Image.
+    Includes timing delays for smooth playback.
     """
     if seed == -1:
         seed = random.randint(0, 2**32 - 1)
     print(f"🎬 Starting video generation with prompt: '{prompt}' and seed: {seed}")
+    # Calculate frame delay based on FPS
+    frame_delay = 1.0 / fps if fps > 0 else 1.0 / 15.0
     print("🔤 Encoding text prompt...")
     conditional_dict = text_encoder(text_prompts=[prompt])
     for key, value in conditional_dict.items():
     all_num_frames = [pipeline.num_frame_per_block] * num_blocks
     total_frames_yielded = 0
+    all_frames_for_video = []
     for idx, current_num_frames in enumerate(all_num_frames):
         print(f"📦 Processing block {idx+1}/{num_blocks} with {current_num_frames} frames")
         print(f"📹 Decoded pixels shape: {pixels.shape}")
+        # Yield individual frames with timing delays
         for frame_idx in range(pixels.shape[1]):
             frame_tensor = pixels[0, frame_idx]  # Get single frame [C, H, W]
             frame_np = torch.clamp(frame_tensor.float(), -1., 1.) * 127.5 + 127.5
             frame_np = frame_np.to(torch.uint8).cpu().numpy()
+            # Convert from CHW to HWC format (RGB)
             frame_np = np.transpose(frame_np, (1, 2, 0))  # CHW -> HWC
             all_frames_for_video.append(frame_np)
             total_frames_yielded += 1
             # Calculate progress
             total_expected_frames = num_blocks * pipeline.num_frame_per_block
             current_frame_count = (idx * pipeline.num_frame_per_block) + frame_idx + 1
+            frame_progress = current_frame_count / total_expected_frames
+            # Update progress
+            progress(frame_progress, desc=f"Frame {total_frames_yielded} | Block {idx+1}/{num_blocks}")
+            print(f"📺 Yielding frame {total_frames_yielded}: shape {frame_np.shape}")
+            # Yield frame with timing delay
+            yield gr.update(visible=True, frame_np), gr.update(visible=False)
+            # Sleep between frames for smooth playback (except for the last frame)
+            if not (frame_idx == pixels.shape[1] - 1 and idx + 1 == num_blocks):
+                time.sleep(frame_delay)
         current_start_frame += current_num_frames
     print(f"✅ Video generation completed! Total frames yielded: {total_frames_yielded}")
+    # Save final video
+    try:
+        video_path = f"gradio_tmp/{seed}_{hashlib.md5(prompt.encode()).hexdigest()}.mp4"
+        imageio.mimwrite(video_path, all_frames_for_video, fps=fps, quality=8)
+        print(f"✅ Video saved to {video_path}")
+        return gr.update(visible=False), gr.update(value=video_path, visible=True)
+    except Exception as e:
+        print(f"⚠️ Could not save final video: {e}")
+        return None, None
 # --- Gradio UI Layout ---
+with gr.Blocks(theme=gr.themes.Soft(), title="Self-Forcing Frame Streaming Demo") as demo:
+    gr.Markdown("# 🚀 Self-Forcing Video Generation with Frame Streaming")
+    gr.Markdown("*Real-time video generation with frame-by-frame display*")
     with gr.Row():
         with gr.Column(scale=2):
             with gr.Row():
                 seed = gr.Number(label="Seed", value=-1, info="Use -1 for a random seed.")
+                fps = gr.Slider(
+                    label="Playback FPS",
+                    minimum=1,
+                    maximum=30,
+                    value=args.fps,
+                    step=1,
+                    info="Frames per second for playback"
+                )
             start_btn = gr.Button("🎬 Start Generation", variant="primary", size="lg")
         with gr.Column(scale=3):
+            gr.Markdown("### 📺 Live Frame Stream")
+            gr.Markdown("*Click 'Start Generation' to begin frame streaming*")
+            frame_display = gr.Image(
+                label="Generated Frames",
                 height=480,
                 width=832,
+                show_label=True,
+                container=True
             )
+            final_video = gr.Video(
+                label="Final Rendered Video",
+                visible=True,
+                interactive=False,
+                height=400
             )
+    # Connect the generator to the image display
+    start_btn.click(
+        fn=video_generation_handler,
+        inputs=[prompt, seed, fps],
+        outputs=[frame_display, final_video],
+        show_progress="full"
     )
 # --- Launch App ---