Spaces:

multimodalart
/

MoDA-fast-talking-head

Running on Zero

App Files Files Community

multimodalart HF Staff committed 11 days ago

Commit

220c7ea

verified ·

1 Parent(s): 55f9dd0

Update app.py

Browse files

Files changed (1) hide show

app.py +63 -4

app.py CHANGED Viewed

@@ -8,6 +8,8 @@ import spaces
 from huggingface_hub import snapshot_download
 from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError, RevisionNotFoundError
 from pathlib import Path
 # Add the src directory to the system path to allow for local imports
 sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), 'src')))
@@ -55,6 +57,45 @@ def download_weights():
     else:
         print(f"Found existing weights at '{WEIGHTS_DIR}'. Skipping download.")
 # --- Initialization ---
 # Create output directory if it doesn't exist
@@ -94,6 +135,10 @@ def generate_motion(source_image_path, driving_audio_path, emotion_name, cfg_sca
     start_time = time.time()
     # Create a unique subdirectory for this run
     timestamp = time.strftime("%Y%m%d-%H%M%S")
     run_output_dir = os.path.join(OUTPUT_DIR, timestamp)
@@ -104,15 +149,16 @@ def generate_motion(source_image_path, driving_audio_path, emotion_name, cfg_sca
     print(f"Starting generation with the following parameters:")
     print(f"  Source Image: {source_image_path}")
-    print(f"  Driving Audio: {driving_audio_path}")
     print(f"  Emotion: {emotion_name} (ID: {emotion_id})")
     print(f"  CFG Scale: {cfg_scale}")
     try:
-        # Call the pipeline's inference method
         result_video_path = pipeline.driven_sample(
             image_path=source_image_path,
-            audio_path=driving_audio_path,
             cfg_scale=float(cfg_scale),
             emo=emotion_id,
             save_dir=".",
@@ -124,6 +170,14 @@ def generate_motion(source_image_path, driving_audio_path, emotion_name, cfg_sca
         import traceback
         traceback.print_exc()
         raise gr.Error(f"An unexpected error occurred: {str(e)}. Please check the console for details.")
     end_time = time.time()
@@ -150,6 +204,7 @@ with gr.Blocks(theme=gr.themes.Soft(), css=".gradio-container {max-width: 90% !i
             </p>
             <p>
             This demo allows you to generate a talking head video from a source image and a driving audio file.
             </p>
         </div>
         """
@@ -161,7 +216,11 @@ with gr.Blocks(theme=gr.themes.Soft(), css=".gradio-container {max-width: 90% !i
                 source_image = gr.Image(label="Source Image", type="filepath", value="src/examples/reference_images/6.jpg")
             with gr.Row():
-                driving_audio = gr.Audio(label="Driving Audio", type="filepath", value="src/examples/driving_audios/5.wav")
             with gr.Row():
                 emotion_dropdown = gr.Dropdown(

 from huggingface_hub import snapshot_download
 from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError, RevisionNotFoundError
 from pathlib import Path
+import tempfile
+from pydub import AudioSegment
 # Add the src directory to the system path to allow for local imports
 sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), 'src')))
     else:
         print(f"Found existing weights at '{WEIGHTS_DIR}'. Skipping download.")
+# --- Audio Conversion Function ---
def ensure_wav_format(audio_path):
    """
    Ensure the given audio file is in WAV format, converting it if needed.

    Args:
        audio_path: Path to the input audio file (``str`` or path-like),
            or ``None``.

    Returns:
        ``None`` when ``audio_path`` is ``None``. When the file already has a
        ``.wav`` suffix (case-insensitive), the input path is returned
        unchanged: a ``str`` input comes back as the exact same string, a
        path-like input as ``str(path)``. Otherwise, the path of a newly
        created temporary WAV file (exported with ``-ar 16000 -ac 1``, i.e.
        16 kHz mono); the caller owns that file and must delete it.

    Raises:
        gr.Error: If the audio cannot be loaded or converted. Any temporary
            file created during the failed attempt is removed first.
    """
    if audio_path is None:
        return None

    source_path = Path(audio_path)

    # Check if already WAV
    if source_path.suffix.lower() == '.wav':
        print(f"Audio is already in WAV format: {source_path}")
        # Hand back the caller's own string rather than str(Path(...)):
        # Path normalizes its input (e.g. "./a.wav" -> "a.wav"), so a caller
        # that detects "a temp file was created" by comparing the result with
        # its input would wrongly treat the original file as temporary.
        return audio_path if isinstance(audio_path, str) else str(source_path)

    # Convert to WAV
    print(f"Converting audio from {source_path.suffix} to WAV format...")
    wav_path = None
    try:
        # Load the audio file
        audio = AudioSegment.from_file(source_path)

        # Reserve a unique temporary file name, and close our handle before
        # exporting so the exporter is not writing to a file we still hold open.
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
            wav_path = tmp_file.name

        # Export as WAV with standard settings
        exported = audio.export(
            wav_path,
            format='wav',
            parameters=["-ar", "16000", "-ac", "1"],  # 16kHz, mono - adjust if your model needs different settings
        )
        # export() returns the output file handle; close it explicitly
        # instead of leaving it for the garbage collector.
        exported.close()
    except Exception as e:
        # The caller never receives wav_path on failure, so remove the
        # partially written temp file here (best effort) to avoid leaking it.
        if wav_path is not None:
            try:
                Path(wav_path).unlink()
            except OSError:
                pass
        print(f"Error converting audio: {e}")
        raise gr.Error(f"Failed to convert audio file to WAV format. Error: {e}") from e

    print(f"Audio converted successfully to: {wav_path}")
    return wav_path
 # --- Initialization ---
 # Create output directory if it doesn't exist
     start_time = time.time()
+    # Ensure audio is in WAV format
+    wav_audio_path = ensure_wav_format(driving_audio_path)
+    temp_wav_created = wav_audio_path != driving_audio_path
     # Create a unique subdirectory for this run
     timestamp = time.strftime("%Y%m%d-%H%M%S")
     run_output_dir = os.path.join(OUTPUT_DIR, timestamp)
     print(f"Starting generation with the following parameters:")
     print(f"  Source Image: {source_image_path}")
+    print(f"  Driving Audio (original): {driving_audio_path}")
+    print(f"  Driving Audio (WAV): {wav_audio_path}")
     print(f"  Emotion: {emotion_name} (ID: {emotion_id})")
     print(f"  CFG Scale: {cfg_scale}")
     try:
+        # Call the pipeline's inference method with the WAV audio
         result_video_path = pipeline.driven_sample(
             image_path=source_image_path,
+            audio_path=wav_audio_path,
             cfg_scale=float(cfg_scale),
             emo=emotion_id,
             save_dir=".",
         import traceback
         traceback.print_exc()
         raise gr.Error(f"An unexpected error occurred: {str(e)}. Please check the console for details.")
+    finally:
+        # Clean up temporary WAV file if created
+        if temp_wav_created and os.path.exists(wav_audio_path):
+            try:
+                os.remove(wav_audio_path)
+                print(f"Cleaned up temporary WAV file: {wav_audio_path}")
+            except Exception as e:
+                print(f"Warning: Could not delete temporary file {wav_audio_path}: {e}")
     end_time = time.time()
             </p>
             <p>
             This demo allows you to generate a talking head video from a source image and a driving audio file.
+            Audio files in any common format (MP3, WAV, M4A, etc.) are supported and will be automatically converted if needed.
             </p>
         </div>
         """
                 source_image = gr.Image(label="Source Image", type="filepath", value="src/examples/reference_images/6.jpg")
             with gr.Row():
+                driving_audio = gr.Audio(
+                    label="Driving Audio (any format - will be converted to WAV if needed)",
+                    type="filepath",
+                    value="src/examples/driving_audios/5.wav"
+                )
             with gr.Row():
                 emotion_dropdown = gr.Dropdown(