MoDA-PLUS

Running on Zero

App Files Community

seawolf2357 committed on 10 days ago

Commit

df2ef41

verified ·

1 Parent(s): af5108f

Update app.py

Browse files

Files changed (1) hide show

app.py +46 -6

app.py CHANGED Viewed

@@ -121,6 +121,11 @@ def interpolate_frames(video_path, target_fps=30):
         width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
         height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
         print(f"Original FPS: {original_fps}, Target FPS: {target_fps}")
         # If target FPS is not higher, return original
@@ -204,6 +209,25 @@ except Exception as e:
 # Invert the emo_map for easy lookup from the dropdown value
 emo_name_to_id = {v: k for k, v in emo_map.items()}
 # --- Core Generation Function ---
 @spaces.GPU(duration=180)  # Increased duration for smoothing and interpolation
 def generate_motion(source_image_path, driving_audio_path, emotion_name,
@@ -227,6 +251,15 @@ def generate_motion(source_image_path, driving_audio_path, emotion_name,
         raise gr.Error("Please upload a source image.")
     if driving_audio_path is None:
         raise gr.Error("Please upload a driving audio file.")
     start_time = time.time()
@@ -341,7 +374,7 @@ with gr.Blocks(theme=gr.themes.Soft(), css=".gradio-container {max-width: 960px
             with gr.Row():
                 driving_audio = gr.Audio(
-                    label="Driving Audio",
                     type="filepath",
                     value="src/examples/driving_audios/5.wav"
                 )
@@ -352,7 +385,7 @@ with gr.Blocks(theme=gr.themes.Soft(), css=".gradio-container {max-width: 960px
                 emotion_dropdown = gr.Dropdown(
                     label="Emotion",
                     choices=list(emo_map.values()),
-                    value="Neutral",
                     info="Select an emotion for more natural facial expressions"
                 )
@@ -371,7 +404,7 @@ with gr.Blocks(theme=gr.themes.Soft(), css=".gradio-container {max-width: 960px
             with gr.Row():
                 smooth_checkbox = gr.Checkbox(
                     label="Enable Smoothing (Experimental)",
-                    value=True,  # Changed to False due to CUDA issues
                     info="May cause errors on some systems. If errors occur, disable this option."
                 )
@@ -400,8 +433,9 @@ with gr.Blocks(theme=gr.themes.Soft(), css=".gradio-container {max-width: 960px
                         <b>Tips for best results:</b><br>
                         • Use high-quality front-facing images<br>
                         • Clear audio without background noise<br>
-                        • Enable smoothing for natural motion<br>
-                        • Adjust CFG scale if motion seems stiff
                         </p>
                     </div>
                     """
@@ -415,10 +449,16 @@ with gr.Blocks(theme=gr.themes.Soft(), css=".gradio-container {max-width: 960px
         Users are solely liable for their actions while using this generative model.
         ### 🚀 **Enhancement Features**
-        - **Frame Smoothing**: Reduces jitter and improves transition between frames
         - **Frame Interpolation**: Increases FPS for smoother motion
         - **Optimized Audio Processing**: Better lip-sync with 24kHz sampling
         - **Fine-tuned CFG Scale**: Better control over motion naturalness
         """
     )

         width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
         height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+        # Fix for FPS detection issue
+        if original_fps == 0 or original_fps is None:
+            print("Warning: Could not detect original FPS. Assuming 25 FPS.")
+            original_fps = 25.0
         print(f"Original FPS: {original_fps}, Target FPS: {target_fps}")
         # If target FPS is not higher, return original
 # Invert the emo_map for easy lookup from the dropdown value
 emo_name_to_id = {v: k for k, v in emo_map.items()}
+# --- Audio Length Check Function ---
+def check_audio_length(audio_path):
+    """
+    Return the duration of an audio file in seconds; warning about overly long audio is left to the caller.
+    Args:
+        audio_path: Path to the audio file
+    Returns:
+        Duration in seconds, or None if the file could not be read
+    """
+    try:
+        audio = AudioSegment.from_file(audio_path)
+        duration_seconds = len(audio) / 1000.0
+        return duration_seconds
+    except Exception as e:
+        print(f"Error checking audio length: {e}")
+        return None
 # --- Core Generation Function ---
 @spaces.GPU(duration=180)  # Increased duration for smoothing and interpolation
 def generate_motion(source_image_path, driving_audio_path, emotion_name,
         raise gr.Error("Please upload a source image.")
     if driving_audio_path is None:
         raise gr.Error("Please upload a driving audio file.")
+    # Check audio length
+    audio_duration = check_audio_length(driving_audio_path)
+    if audio_duration:
+        print(f"Audio duration: {audio_duration:.1f} seconds")
+        if audio_duration > 60:
+            gr.Warning(f"⚠️ Audio is {audio_duration:.1f} seconds long. MoDA works best with audio under 60 seconds. Processing may be slow and quality may degrade.")
+        if audio_duration > 180:
+            raise gr.Error("Audio is too long. Please use audio files under 3 minutes (180 seconds) for best results.")
     start_time = time.time()
             with gr.Row():
                 driving_audio = gr.Audio(
+                    label="Driving Audio (Recommended: < 60 seconds)",
                     type="filepath",
                     value="src/examples/driving_audios/5.wav"
                 )
                 emotion_dropdown = gr.Dropdown(
                     label="Emotion",
                     choices=list(emo_map.values()),
+                    value="None",
                     info="Select an emotion for more natural facial expressions"
                 )
             with gr.Row():
                 smooth_checkbox = gr.Checkbox(
                     label="Enable Smoothing (Experimental)",
+                    value=False,  # Changed to False due to CUDA issues
                     info="May cause errors on some systems. If errors occur, disable this option."
                 )
                         <b>Tips for best results:</b><br>
                         • Use high-quality front-facing images<br>
                         • Clear audio without background noise<br>
+                        • <b>Keep audio under 60 seconds</b><br>
+                        • Adjust CFG scale if motion seems stiff<br>
+                        • For longer audio, split into segments
                         </p>
                     </div>
                     """
         Users are solely liable for their actions while using this generative model.
         ### 🚀 **Enhancement Features**
+        - **Frame Smoothing**: Reduces jitter and improves transition between frames (currently experimental)
         - **Frame Interpolation**: Increases FPS for smoother motion
         - **Optimized Audio Processing**: Better lip-sync with 24kHz sampling
         - **Fine-tuned CFG Scale**: Better control over motion naturalness
+        ### ⏱️ **Audio Length Limitations**
+        - **Optimal**: Under 30 seconds for best quality and speed
+        - **Recommended**: Under 60 seconds
+        - **Maximum**: 180 seconds (3 minutes) - very slow processing
+        - For longer content, consider splitting audio into segments
         """
     )