Update app.py
app.py
CHANGED
@@ -10,6 +10,9 @@ from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError, Revis
 from pathlib import Path
 import tempfile
 from pydub import AudioSegment
+import cv2
+import numpy as np
+from scipy import interpolate
 
 # Add the src directory to the system path to allow for local imports
 sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), 'src')))
@@ -83,11 +86,11 @@ def ensure_wav_format(audio_path):
         # Create a temporary WAV file
         with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
             wav_path = tmp_file.name
-        # Export as WAV with
+        # Export as WAV with higher sampling rate for better quality
         audio.export(
             wav_path,
             format='wav',
-            parameters=["-ar", "
+            parameters=["-ar", "24000", "-ac", "1"]  # 24kHz, mono for better lip-sync
         )
 
         print(f"Audio converted successfully to: {wav_path}")
@@ -97,6 +100,88 @@ def ensure_wav_format(audio_path):
         print(f"Error converting audio: {e}")
         raise gr.Error(f"Failed to convert audio file to WAV format. Error: {e}")
 
+# --- Frame Interpolation Function ---
+def interpolate_frames(video_path, target_fps=30):
+    """
+    Interpolates frames in a video to achieve smoother motion.
+
+    Args:
+        video_path: Path to the input video
+        target_fps: Target frames per second
+
+    Returns:
+        Path to the interpolated video
+    """
+    try:
+        video_path = str(video_path)
+        cap = cv2.VideoCapture(video_path)
+
+        # Get original video properties
+        original_fps = cap.get(cv2.CAP_PROP_FPS)
+        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+
+        print(f"Original FPS: {original_fps}, Target FPS: {target_fps}")
+
+        # If target FPS is not higher, return original
+        if original_fps >= target_fps:
+            cap.release()
+            print("Target FPS is not higher than original. Skipping interpolation.")
+            return video_path
+
+        # Read all frames
+        frames = []
+        while True:
+            ret, frame = cap.read()
+            if not ret:
+                break
+            frames.append(frame)
+        cap.release()
+
+        if len(frames) < 2:
+            print("Not enough frames for interpolation.")
+            return video_path
+
+        # Calculate interpolation factor
+        interpolation_factor = int(target_fps / original_fps)
+        interpolated_frames = []
+
+        print(f"Interpolating with factor: {interpolation_factor}")
+
+        # Perform frame interpolation
+        for i in range(len(frames) - 1):
+            interpolated_frames.append(frames[i])
+
+            # Generate intermediate frames
+            for j in range(1, interpolation_factor):
+                alpha = j / interpolation_factor
+                # Use weighted average for simple interpolation
+                interpolated_frame = cv2.addWeighted(
+                    frames[i], 1 - alpha,
+                    frames[i + 1], alpha,
+                    0
+                )
+                interpolated_frames.append(interpolated_frame)
+
+        # Add the last frame
+        interpolated_frames.append(frames[-1])
+
+        # Save the interpolated video
+        output_path = video_path.replace('.mp4', '_interpolated.mp4')
+        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+        out = cv2.VideoWriter(output_path, fourcc, target_fps, (width, height))
+
+        for frame in interpolated_frames:
+            out.write(frame)
+        out.release()
+
+        print(f"Interpolated video saved to: {output_path}")
+        return output_path
+
+    except Exception as e:
+        print(f"Error during frame interpolation: {e}")
+        return video_path  # Return original if interpolation fails
+
 # --- Initialization ---
 # Create output directory if it doesn't exist
 os.makedirs(OUTPUT_DIR, exist_ok=True)
@@ -120,10 +205,20 @@ except Exception as e:
 emo_name_to_id = {v: k for k, v in emo_map.items()}
 
 # --- Core Generation Function ---
-@spaces.GPU(duration=
-def generate_motion(source_image_path, driving_audio_path, emotion_name,
+@spaces.GPU(duration=180)  # Increased duration for smoothing and interpolation
+def generate_motion(source_image_path, driving_audio_path, emotion_name,
+                    cfg_scale, smooth_enabled, target_fps,
+                    progress=gr.Progress(track_tqdm=True)):
     """
     The main function that takes Gradio inputs and generates the talking head video.
+
+    Args:
+        source_image_path: Path to the source image
+        driving_audio_path: Path to the driving audio
+        emotion_name: Selected emotion
+        cfg_scale: CFG scale for generation
+        smooth_enabled: Whether to enable smoothing
+        target_fps: Target frames per second for interpolation
     """
     if pipeline is None:
         raise gr.Error("Pipeline failed to initialize. Check the console logs for details.")
@@ -135,7 +230,7 @@ def generate_motion(source_image_path, driving_audio_path, emotion_name, cfg_sca
 
     start_time = time.time()
 
-    # Ensure audio is in WAV format
+    # Ensure audio is in WAV format with optimal sampling rate
     wav_audio_path = ensure_wav_format(driving_audio_path)
     temp_wav_created = wav_audio_path != driving_audio_path
 
@@ -153,6 +248,8 @@ def generate_motion(source_image_path, driving_audio_path, emotion_name, cfg_sca
     print(f" Driving Audio (WAV): {wav_audio_path}")
     print(f" Emotion: {emotion_name} (ID: {emotion_id})")
     print(f" CFG Scale: {cfg_scale}")
+    print(f" Smoothing: {smooth_enabled}")
+    print(f" Target FPS: {target_fps}")
 
     try:
         # Call the pipeline's inference method with the WAV audio
@@ -162,9 +259,15 @@ def generate_motion(source_image_path, driving_audio_path, emotion_name, cfg_sca
             cfg_scale=float(cfg_scale),
             emo=emotion_id,
             save_dir=".",
-            smooth=
+            smooth=smooth_enabled,  # Use the checkbox value
             silent_audio_path=DEFAULT_SILENT_AUDIO_PATH,
         )
+
+        # Apply frame interpolation if requested
+        if target_fps > 24:  # Assuming default is around 24 FPS
+            print(f"Applying frame interpolation to achieve {target_fps} FPS...")
+            result_video_path = interpolate_frames(result_video_path, target_fps=target_fps)
+
     except Exception as e:
         print(f"An error occurred during video generation: {e}")
         import traceback
@@ -180,7 +283,6 @@ def generate_motion(source_image_path, driving_audio_path, emotion_name, cfg_sca
             print(f"Warning: Could not delete temporary file {wav_audio_path}: {e}")
 
     end_time = time.time()
-
     processing_time = end_time - start_time
 
     result_video_path = Path(result_video_path)
@@ -197,7 +299,8 @@ with gr.Blocks(theme=gr.themes.Soft(), css=".gradio-container {max-width: 960px
         """
         <div align='center'>
             <h1>MoDA: Multi-modal Diffusion Architecture for Talking Head Generation</h1>
-            <
+            <h2 style="color: #4A90E2;">Enhanced Version with Smooth Motion</h2>
+            <p style="display:flex; justify-content: center; gap: 10px;">
                 <a href='https://lixinyyang.github.io/MoDA.github.io/'><img src='https://img.shields.io/badge/Project-Page-blue'></a>
                 <a href='https://arxiv.org/abs/2507.03256'><img src='https://img.shields.io/badge/Paper-Arxiv-red'></a>
                 <a href='https://github.com/lixinyyang/MoDA/'><img src='https://img.shields.io/badge/Code-Github-green'></a>
@@ -208,8 +311,14 @@ with gr.Blocks(theme=gr.themes.Soft(), css=".gradio-container {max-width: 960px
 
     with gr.Row(variant="panel"):
        with gr.Column(scale=1):
+            gr.Markdown("### 📥 Input Settings")
+
            with gr.Row():
-                source_image = gr.Image(
+                source_image = gr.Image(
+                    label="Source Image",
+                    type="filepath",
+                    value="src/examples/reference_images/7.jpg"
+                )
 
            with gr.Row():
                driving_audio = gr.Audio(
@@ -218,38 +327,99 @@ with gr.Blocks(theme=gr.themes.Soft(), css=".gradio-container {max-width: 960px
                    value="src/examples/driving_audios/5.wav"
                )
 
+            gr.Markdown("### ⚙️ Generation Settings")
+
            with gr.Row():
                emotion_dropdown = gr.Dropdown(
                    label="Emotion",
                    choices=list(emo_map.values()),
-                    value="None"
+                    value="None",
+                    info="Select an emotion for more natural facial expressions"
                )
 
            with gr.Row():
                cfg_slider = gr.Slider(
-                    label="CFG Scale",
-                    minimum=
-                    maximum=
-                    step=0.
-                    value=1.
+                    label="CFG Scale (Lower = Smoother motion)",
+                    minimum=0.5,
+                    maximum=5.0,
+                    step=0.1,
+                    value=1.0,
+                    info="Lower values produce smoother but less controlled motion"
+                )
+
+            gr.Markdown("### 🎬 Motion Enhancement")
+
+            with gr.Row():
+                smooth_checkbox = gr.Checkbox(
+                    label="Enable Smoothing",
+                    value=True,
+                    info="Enables frame smoothing for more natural motion (increases processing time)"
                )
 
-
+            with gr.Row():
+                fps_slider = gr.Slider(
+                    label="Target FPS",
+                    minimum=24,
+                    maximum=60,
+                    step=6,
+                    value=30,
+                    info="Higher FPS for smoother motion (uses frame interpolation)"
+                )
+
+            submit_button = gr.Button("🎥 Generate Video", variant="primary", size="lg")
 
        with gr.Column(scale=1):
+            gr.Markdown("### 📺 Output")
            output_video = gr.Video(label="Generated Video")
+
+            # Processing status
+            with gr.Row():
+                gr.Markdown(
+                    """
+                    <div style="background-color: #f0f8ff; padding: 10px; border-radius: 5px; margin-top: 10px;">
+                    <p style="margin: 0; font-size: 0.9em;">
+                    <b>Tips for best results:</b><br>
+                    • Use high-quality front-facing images<br>
+                    • Clear audio without background noise<br>
+                    • Enable smoothing for natural motion<br>
+                    • Adjust CFG scale if motion seems stiff
+                    </p>
+                    </div>
+                    """
+                )
 
    gr.Markdown(
        """
        ---
-        ### **Disclaimer**
-        This project is intended for academic research, and we explicitly disclaim any responsibility for user-generated content.
+        ### ⚠️ **Disclaimer**
+        This project is intended for academic research, and we explicitly disclaim any responsibility for user-generated content.
+        Users are solely liable for their actions while using this generative model.
+
+        ### 🚀 **Enhancement Features**
+        - **Frame Smoothing**: Reduces jitter and improves transition between frames
+        - **Frame Interpolation**: Increases FPS for smoother motion
+        - **Optimized Audio Processing**: Better lip-sync with 24kHz sampling
+        - **Fine-tuned CFG Scale**: Better control over motion naturalness
        """
    )
 
+    # Examples section
+    gr.Examples(
+        examples=[
+            ["src/examples/reference_images/7.jpg", "src/examples/driving_audios/5.wav", "None", 1.0, True, 30],
+            ["src/examples/reference_images/7.jpg", "src/examples/driving_audios/5.wav", "Happy", 0.8, True, 30],
+            ["src/examples/reference_images/7.jpg", "src/examples/driving_audios/5.wav", "Sad", 1.2, True, 24],
+        ],
+        inputs=[source_image, driving_audio, emotion_dropdown, cfg_slider, smooth_checkbox, fps_slider],
+        outputs=output_video,
+        fn=generate_motion,
+        cache_examples=False,
+        label="Example Configurations"
+    )
+
    submit_button.click(
        fn=generate_motion,
-        inputs=[source_image, driving_audio, emotion_dropdown, cfg_slider],
+        inputs=[source_image, driving_audio, emotion_dropdown, cfg_slider, smooth_checkbox, fps_slider],
        outputs=output_video
    )
 
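Note on the audio change: pydub's export(parameters=[...]) passes the list straight through to the underlying ffmpeg encoder, so "-ar 24000 -ac 1" resamples the exported WAV to 24 kHz mono. A quick sanity check, as a sketch (assumes pydub is installed; the path "out.wav" stands in for whatever ensure_wav_format returned):

    from pydub import AudioSegment

    seg = AudioSegment.from_wav("out.wav")  # hypothetical output of ensure_wav_format
    assert seg.frame_rate == 24000          # "-ar 24000": 24 kHz sample rate
    assert seg.channels == 1                # "-ac 1": mono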
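Note on interpolate_frames: because interpolation_factor = int(target_fps / original_fps), a non-integer ratio such as 24 -> 30 fps truncates to a factor of 1, so no intermediate frames are blended and the original frames are simply rewritten at the higher rate, which shortens the clip and can drift out of sync with the audio. A duration-preserving variant would remap each output frame to a fractional source position and cross-fade the two neighbouring frames. A minimal sketch under that assumption (the helper name resample_frames is illustrative, not part of this commit):

    import cv2

    def resample_frames(frames, original_fps, target_fps):
        # Number of output frames that keeps the clip duration unchanged
        n_out = int(len(frames) * target_fps / original_fps)
        out = []
        for k in range(n_out):
            t = k * original_fps / target_fps  # fractional position in the source
            i = int(t)
            if i >= len(frames) - 1:  # past the last pair: repeat the final frame
                out.append(frames[-1])
                continue
            alpha = t - i  # blend weight between frames i and i+1
            out.append(cv2.addWeighted(frames[i], 1 - alpha, frames[i + 1], alpha, 0))
        return out

This uses the same cv2.addWeighted cross-fade as the commit, but driven by timestamps instead of a whole-number factor, so 24 -> 30 fps yields 1.25x as many frames and the audio stays aligned.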