Update app.py
Browse files
app.py
CHANGED
@@ -121,6 +121,11 @@ def interpolate_frames(video_path, target_fps=30):
|
|
121 |
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
|
122 |
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
|
123 |
|
|
|
|
|
|
|
|
|
|
|
124 |
print(f"Original FPS: {original_fps}, Target FPS: {target_fps}")
|
125 |
|
126 |
# If target FPS is not higher, return original
|
@@ -204,6 +209,25 @@ except Exception as e:
|
|
204 |
# Invert the emo_map for easy lookup from the dropdown value
|
205 |
emo_name_to_id = {v: k for k, v in emo_map.items()}
|
206 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
207 |
# --- Core Generation Function ---
|
208 |
@spaces.GPU(duration=180) # Increased duration for smoothing and interpolation
|
209 |
def generate_motion(source_image_path, driving_audio_path, emotion_name,
|
@@ -227,6 +251,15 @@ def generate_motion(source_image_path, driving_audio_path, emotion_name,
|
|
227 |
raise gr.Error("Please upload a source image.")
|
228 |
if driving_audio_path is None:
|
229 |
raise gr.Error("Please upload a driving audio file.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
230 |
|
231 |
start_time = time.time()
|
232 |
|
@@ -341,7 +374,7 @@ with gr.Blocks(theme=gr.themes.Soft(), css=".gradio-container {max-width: 960px
|
|
341 |
|
342 |
with gr.Row():
|
343 |
driving_audio = gr.Audio(
|
344 |
-
label="Driving Audio",
|
345 |
type="filepath",
|
346 |
value="src/examples/driving_audios/5.wav"
|
347 |
)
|
@@ -352,7 +385,7 @@ with gr.Blocks(theme=gr.themes.Soft(), css=".gradio-container {max-width: 960px
|
|
352 |
emotion_dropdown = gr.Dropdown(
|
353 |
label="Emotion",
|
354 |
choices=list(emo_map.values()),
|
355 |
-
value="
|
356 |
info="Select an emotion for more natural facial expressions"
|
357 |
)
|
358 |
|
@@ -371,7 +404,7 @@ with gr.Blocks(theme=gr.themes.Soft(), css=".gradio-container {max-width: 960px
|
|
371 |
with gr.Row():
|
372 |
smooth_checkbox = gr.Checkbox(
|
373 |
label="Enable Smoothing (Experimental)",
|
374 |
-
value=
|
375 |
info="May cause errors on some systems. If errors occur, disable this option."
|
376 |
)
|
377 |
|
@@ -400,8 +433,9 @@ with gr.Blocks(theme=gr.themes.Soft(), css=".gradio-container {max-width: 960px
|
|
400 |
<b>Tips for best results:</b><br>
|
401 |
• Use high-quality front-facing images<br>
|
402 |
• Clear audio without background noise<br>
|
403 |
-
•
|
404 |
-
• Adjust CFG scale if motion seems stiff
|
|
|
405 |
</p>
|
406 |
</div>
|
407 |
"""
|
@@ -415,10 +449,16 @@ with gr.Blocks(theme=gr.themes.Soft(), css=".gradio-container {max-width: 960px
|
|
415 |
Users are solely liable for their actions while using this generative model.
|
416 |
|
417 |
### 🚀 **Enhancement Features**
|
418 |
-
- **Frame Smoothing**: Reduces jitter and improves transition between frames
|
419 |
- **Frame Interpolation**: Increases FPS for smoother motion
|
420 |
- **Optimized Audio Processing**: Better lip-sync with 24kHz sampling
|
421 |
- **Fine-tuned CFG Scale**: Better control over motion naturalness
|
|
|
|
|
|
|
|
|
|
|
|
|
422 |
"""
|
423 |
)
|
424 |
|
|
|
121 |
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
|
122 |
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
|
123 |
|
124 |
+
# Fix for FPS detection issue
|
125 |
+
if original_fps == 0 or original_fps is None:
|
126 |
+
print("Warning: Could not detect original FPS. Assuming 25 FPS.")
|
127 |
+
original_fps = 25.0
|
128 |
+
|
129 |
print(f"Original FPS: {original_fps}, Target FPS: {target_fps}")
|
130 |
|
131 |
# If target FPS is not higher, return original
|
|
|
209 |
# Invert the emo_map for easy lookup from the dropdown value
|
210 |
emo_name_to_id = {v: k for k, v in emo_map.items()}
|
211 |
|
212 |
+
# --- Audio Length Check Function ---
|
213 |
+
def check_audio_length(audio_path):
    """
    Return the duration of an audio file in seconds.

    This helper only measures the clip. Deciding whether the duration is
    too long (and warning about it or rejecting it) is left to the caller.

    Args:
        audio_path: Path to the audio file. A missing path (None or an
            empty string) is treated as "duration unknown".

    Returns:
        Duration in seconds as a float, or None when no path was given or
        the file could not be loaded.
    """
    # Nothing to measure: return early instead of handing an empty path to
    # the loader and printing a misleading error message.
    if not audio_path:
        return None

    try:
        # NOTE(review): AudioSegment is imported elsewhere in this file
        # (presumably pydub) -- confirm against the module imports.
        audio = AudioSegment.from_file(audio_path)
        # The segment length is divided by 1000 to express it in seconds.
        duration_seconds = len(audio) / 1000.0
        return duration_seconds
    except Exception as e:
        # Deliberate best-effort probe: any load/decode failure is reported
        # and mapped to None so the caller can continue without a duration.
        print(f"Error checking audio length: {e}")
        return None
|
230 |
+
|
231 |
# --- Core Generation Function ---
|
232 |
@spaces.GPU(duration=180) # Increased duration for smoothing and interpolation
|
233 |
def generate_motion(source_image_path, driving_audio_path, emotion_name,
|
|
|
251 |
raise gr.Error("Please upload a source image.")
|
252 |
if driving_audio_path is None:
|
253 |
raise gr.Error("Please upload a driving audio file.")
|
254 |
+
|
255 |
+
# Check audio length
|
256 |
+
audio_duration = check_audio_length(driving_audio_path)
|
257 |
+
if audio_duration:
|
258 |
+
print(f"Audio duration: {audio_duration:.1f} seconds")
|
259 |
+
if audio_duration > 60:
|
260 |
+
gr.Warning(f"⚠️ Audio is {audio_duration:.1f} seconds long. MoDA works best with audio under 60 seconds. Processing may be slow and quality may degrade.")
|
261 |
+
if audio_duration > 180:
|
262 |
+
raise gr.Error("Audio is too long. Please use audio files under 3 minutes (180 seconds) for best results.")
|
263 |
|
264 |
start_time = time.time()
|
265 |
|
|
|
374 |
|
375 |
with gr.Row():
|
376 |
driving_audio = gr.Audio(
|
377 |
+
label="Driving Audio (Recommended: < 60 seconds)",
|
378 |
type="filepath",
|
379 |
value="src/examples/driving_audios/5.wav"
|
380 |
)
|
|
|
385 |
emotion_dropdown = gr.Dropdown(
|
386 |
label="Emotion",
|
387 |
choices=list(emo_map.values()),
|
388 |
+
value="None",
|
389 |
info="Select an emotion for more natural facial expressions"
|
390 |
)
|
391 |
|
|
|
404 |
with gr.Row():
|
405 |
smooth_checkbox = gr.Checkbox(
|
406 |
label="Enable Smoothing (Experimental)",
|
407 |
+
value=False, # Changed to False due to CUDA issues
|
408 |
info="May cause errors on some systems. If errors occur, disable this option."
|
409 |
)
|
410 |
|
|
|
433 |
<b>Tips for best results:</b><br>
|
434 |
• Use high-quality front-facing images<br>
|
435 |
• Clear audio without background noise<br>
|
436 |
+
• <b>Keep audio under 60 seconds</b><br>
|
437 |
+
• Adjust CFG scale if motion seems stiff<br>
|
438 |
+
• For longer audio, split into segments
|
439 |
</p>
|
440 |
</div>
|
441 |
"""
|
|
|
449 |
Users are solely liable for their actions while using this generative model.
|
450 |
|
451 |
### 🚀 **Enhancement Features**
|
452 |
+
- **Frame Smoothing**: Reduces jitter and improves transition between frames (currently experimental)
|
453 |
- **Frame Interpolation**: Increases FPS for smoother motion
|
454 |
- **Optimized Audio Processing**: Better lip-sync with 24kHz sampling
|
455 |
- **Fine-tuned CFG Scale**: Better control over motion naturalness
|
456 |
+
|
457 |
+
### ⏱️ **Audio Length Limitations**
|
458 |
+
- **Optimal**: Under 30 seconds for best quality and speed
|
459 |
+
- **Recommended**: Under 60 seconds
|
460 |
+
- **Maximum**: 180 seconds (3 minutes) - very slow processing
|
461 |
+
- For longer content, consider splitting audio into segments
|
462 |
"""
|
463 |
)
|
464 |
|