Spaces:

yl4579
/

DMOSpeech2-demo

Running on Zero

App Files Files Community

yl4579 committed 30 days ago

Commit

4972f24

verified ·

1 Parent(s): 08848b6

Update app.py

Browse files

Files changed (1) hide show

app.py +109 -51

app.py CHANGED Viewed

@@ -8,14 +8,65 @@ from pathlib import Path
 from huggingface_hub import hf_hub_download
 import os
 import spaces
-# Import the inference module (assuming it's named 'infer.py' based on the notebook)
 from infer import DMOInference
-# Global model instance
 model = None
 device = "cuda" if torch.cuda.is_available() else "cpu"
 def download_models():
     """Download models from HuggingFace Hub."""
     try:
@@ -68,20 +119,21 @@ def initialize_model():
     except Exception as e:
         return False, f"Error initializing model: {str(e)}"
-# Initialize model on startup
 model_loaded, status_message = initialize_model()
-@spaces.GPU          # ZeroGPU allocates a slice only while this runs
 def generate_speech(
     prompt_audio,
     prompt_text,
     target_text,
     mode,
-    # Advanced settings
     custom_teacher_steps,
     custom_teacher_stopping_time,
     custom_student_start_step,
-    temperature,
     verbose
 ):
     """Generate speech with different configurations."""
@@ -96,6 +148,12 @@ def generate_speech(
         return None, "Please enter text to generate!", "", ""
     try:
         start_time = time.time()
         # Configure parameters based on mode
@@ -151,7 +209,7 @@ def generate_speech(
         # Format metrics
         metrics = f"RTF: {rtf:.2f}x ({1/rtf:.2f}x speed) | Processing: {processing_time:.2f}s for {audio_duration:.2f}s audio"
-        return output_path, "Success!", metrics, f"Mode: {mode}"
     except Exception as e:
         return None, f"Error: {str(e)}", "", ""
@@ -163,7 +221,7 @@ with gr.Blocks(title="DMOSpeech 2 - Zero-Shot TTS", theme=gr.themes.Soft()) as d
     Generate natural speech in any voice with just a short reference audio!
-    **Model Status:** {status_message} | **Device:** {device.upper()}
     """)
     with gr.Row():
@@ -176,7 +234,7 @@ with gr.Blocks(title="DMOSpeech 2 - Zero-Shot TTS", theme=gr.themes.Soft()) as d
             )
             prompt_text = gr.Textbox(
-                label="📝 Reference Text (optional - will auto-transcribe if empty)",
                 placeholder="The text spoken in the reference audio...",
                 lines=2
             )
@@ -202,7 +260,17 @@ with gr.Blocks(title="DMOSpeech 2 - Zero-Shot TTS", theme=gr.themes.Soft()) as d
             # Advanced settings (collapsible)
             with gr.Accordion("⚙️ Advanced Settings", open=False):
-                with gr.Row():
                     custom_teacher_steps = gr.Slider(
                         minimum=0,
                         maximum=32,
@@ -230,15 +298,6 @@ with gr.Blocks(title="DMOSpeech 2 - Zero-Shot TTS", theme=gr.themes.Soft()) as d
                         info="Which student step to start from"
                     )
-                temperature = gr.Slider(
-                    minimum=0.0,
-                    maximum=2.0,
-                    value=0.0,
-                    step=0.1,
-                    label="Duration Temperature",
-                    info="0 = deterministic, >0 = more variation in speech rhythm"
-                )
                 verbose = gr.Checkbox(
                     value=False,
                     label="Verbose Output",
@@ -274,10 +333,11 @@ with gr.Blocks(title="DMOSpeech 2 - Zero-Shot TTS", theme=gr.themes.Soft()) as d
             gr.Markdown("""
             ### 💡 Quick Tips:
             - **Student Only**: Fastest (4 steps), good quality
             - **Teacher-Guided**: Best balance (8 steps), recommended
             - **High Diversity**: More natural prosody (16 steps)
-            - **Temperature**: Add randomness to speech rhythm
             ### 📊 Expected RTF (Real-Time Factor):
             - Student Only: ~0.05x (20x faster than real-time)
@@ -286,35 +346,30 @@ with gr.Blocks(title="DMOSpeech 2 - Zero-Shot TTS", theme=gr.themes.Soft()) as d
             """)
     # Examples section
-    gr.Markdown("### 🎯 Examples")
-    examples = [
-        [
-            None,  # Will be replaced with actual audio path
-            "Some call me nature, others call me mother nature.",
-            "I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring.",
-            "Teacher-Guided (8 steps)",
-            16, 0.07, 1, 0.0, False
-        ],
-        [
-            None,  # Will be replaced with actual audio path
-            "对，这就是我，万人敬仰的太乙真人。",
-            '突然，身边一阵笑声。我看着他们，意气风发地挺直了胸膛，甩了甩那稍显肉感的双臂，轻笑道："我身上的肉，是为了掩饰我爆棚的魅力，否则，岂不吓坏了你们呢？"',
-            "Teacher-Guided (8 steps)",
-            16, 0.07, 1, 0.0, False
-        ],
-        [
-            None,
-            "对，这就是我，万人敬仰的太乙真人。",
-            '突然，身边一阵笑声。我看着他们，意气风发地挺直了胸膛，甩了甩那稍显肉感的双臂，轻笑道："我身上的肉，是为了掩饰我爆棚的魅力，否则，岂不吓坏了你们呢？"',
-            "High Diversity (16 steps)",
-            24, 0.3, 2, 0.8, False
-        ]
-    ]
-    # Note about example audio files
     gr.Markdown("""
-    *Note: Example audio files should be uploaded to the Space. The examples above show the text configurations used in the original notebook.*
     """)
     # Event handler
@@ -325,10 +380,10 @@ with gr.Blocks(title="DMOSpeech 2 - Zero-Shot TTS", theme=gr.themes.Soft()) as d
             prompt_text,
             target_text,
             mode,
             custom_teacher_steps,
             custom_teacher_stopping_time,
             custom_student_start_step,
-            temperature,
             verbose
         ],
         outputs=[output_audio, status, metrics, info]
@@ -336,17 +391,20 @@ with gr.Blocks(title="DMOSpeech 2 - Zero-Shot TTS", theme=gr.themes.Soft()) as d
     # Update visibility of custom settings based on mode
     def update_custom_visibility(mode):
-        return gr.update(visible=(mode == "Custom"))
     mode.change(
-        lambda x: [gr.update(interactive=(x == "Custom"))] * 3,
         inputs=[mode],
-        outputs=[custom_teacher_steps, custom_teacher_stopping_time, custom_student_start_step]
     )
 # Launch the app
 if __name__ == "__main__":
     if not model_loaded:
         print(f"Warning: Model failed to load - {status_message}")
     demo.launch()

 from huggingface_hub import hf_hub_download
 import os
 import spaces
+from transformers import pipeline
+# Import the inference module
 from infer import DMOInference
+# Global variables
 model = None
+asr_pipe = None
 device = "cuda" if torch.cuda.is_available() else "cpu"
+# Initialize ASR pipeline
+def initialize_asr_pipeline(device=device, dtype=None):
+    """Initialize the ASR pipeline on startup."""
+    global asr_pipe
+    if dtype is None:
+        dtype = (
+            torch.float16
+            if "cuda" in device
+            and torch.cuda.is_available()
+            and torch.cuda.get_device_properties(device).major >= 7
+            and not torch.cuda.get_device_name().endswith("[ZLUDA]")
+            else torch.float32
+        )
+    print("Initializing ASR pipeline...")
+    try:
+        asr_pipe = pipeline(
+            "automatic-speech-recognition",
+            model="openai/whisper-large-v3-turbo",
+            torch_dtype=dtype,
+            device="cpu"  # Keep ASR on CPU to save GPU memory
+        )
+        print("ASR pipeline initialized successfully")
+    except Exception as e:
+        print(f"Error initializing ASR pipeline: {e}")
+        asr_pipe = None
+# Transcribe function
+def transcribe(ref_audio, language=None):
+    """Transcribe audio using the pre-loaded ASR pipeline."""
+    global asr_pipe
+    if asr_pipe is None:
+        return ""  # Return empty string if ASR is not available
+    try:
+        result = asr_pipe(
+            ref_audio,
+            chunk_length_s=30,
+            batch_size=128,
+            generate_kwargs={"task": "transcribe", "language": language} if language else {"task": "transcribe"},
+            return_timestamps=False,
+        )
+        return result["text"].strip()
+    except Exception as e:
+        print(f"Transcription error: {e}")
+        return ""
 def download_models():
     """Download models from HuggingFace Hub."""
     try:
     except Exception as e:
         return False, f"Error initializing model: {str(e)}"
+# Initialize models on startup
+print("Initializing models...")
 model_loaded, status_message = initialize_model()
+initialize_asr_pipeline()  # Initialize ASR pipeline
+@spaces.GPU(duration=120)  # Request GPU for up to 120 seconds
 def generate_speech(
     prompt_audio,
     prompt_text,
     target_text,
     mode,
+    temperature,
     custom_teacher_steps,
     custom_teacher_stopping_time,
     custom_student_start_step,
     verbose
 ):
     """Generate speech with different configurations."""
         return None, "Please enter text to generate!", "", ""
     try:
+        # Auto-transcribe if prompt_text is empty
+        if not prompt_text and prompt_text != "":
+            print("Auto-transcribing reference audio...")
+            prompt_text = transcribe(prompt_audio)
+            print(f"Transcribed: {prompt_text}")
         start_time = time.time()
         # Configure parameters based on mode
         # Format metrics
         metrics = f"RTF: {rtf:.2f}x ({1/rtf:.2f}x speed) | Processing: {processing_time:.2f}s for {audio_duration:.2f}s audio"
+        return output_path, "Success!", metrics, f"Mode: {mode} | Transcribed: {prompt_text[:50]}..." if not prompt_text else f"Mode: {mode}"
     except Exception as e:
         return None, f"Error: {str(e)}", "", ""
     Generate natural speech in any voice with just a short reference audio!
+    **Model Status:** {status_message} | **Device:** {device.upper()} | **ASR:** {"✅ Ready" if asr_pipe else "❌ Not available"}
     """)
     with gr.Row():
             )
             prompt_text = gr.Textbox(
+                label="📝 Reference Text (leave empty for auto-transcription)",
                 placeholder="The text spoken in the reference audio...",
                 lines=2
             )
             # Advanced settings (collapsible)
             with gr.Accordion("⚙️ Advanced Settings", open=False):
+                temperature = gr.Slider(
+                    minimum=0.0,
+                    maximum=2.0,
+                    value=0.0,
+                    step=0.1,
+                    label="Duration Temperature",
+                    info="0 = deterministic, >0 = more variation in speech rhythm"
+                )
+                with gr.Group(visible=False) as custom_settings:
+                    gr.Markdown("### Custom Mode Settings")
                     custom_teacher_steps = gr.Slider(
                         minimum=0,
                         maximum=32,
                         info="Which student step to start from"
                     )
                 verbose = gr.Checkbox(
                     value=False,
                     label="Verbose Output",
             gr.Markdown("""
             ### 💡 Quick Tips:
+            - **Auto-transcription**: Leave reference text empty to auto-transcribe
             - **Student Only**: Fastest (4 steps), good quality
             - **Teacher-Guided**: Best balance (8 steps), recommended
             - **High Diversity**: More natural prosody (16 steps)
+            - **Custom Mode**: Fine-tune all parameters
             ### 📊 Expected RTF (Real-Time Factor):
             - Student Only: ~0.05x (20x faster than real-time)
             """)
     # Examples section
+    gr.Markdown("### 🎯 Example Configurations")
     gr.Markdown("""
+    <details>
+    <summary>English Example</summary>
+    **Reference text:** "Some call me nature, others call me mother nature."
+    **Target text:** "I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring."
+    </details>
+    <details>
+    <summary>Chinese Example</summary>
+    **Reference text:** "对，这就是我，万人敬仰的太乙真人。"
+    **Target text:** "突然，身边一阵笑声。我看着他们，意气风发地挺直了胸膛，甩了甩那稍显肉感的双臂，轻笑道：'我身上的肉，是为了掩饰我爆棚的魅力，否则，岂不吓坏了你们呢？'"
+    </details>
+    <details>
+    <summary>High Diversity Chinese Example</summary>
+    Same as above but with **Temperature: 0.8** for more natural variation in speech rhythm.
+    </details>
     """)
     # Event handler
             prompt_text,
             target_text,
             mode,
+            temperature,
             custom_teacher_steps,
             custom_teacher_stopping_time,
             custom_student_start_step,
             verbose
         ],
         outputs=[output_audio, status, metrics, info]
     # Update visibility of custom settings based on mode
     def update_custom_visibility(mode):
+        is_custom = (mode == "Custom")
+        return gr.update(visible=is_custom)
     mode.change(
+        update_custom_visibility,
         inputs=[mode],
+        outputs=[custom_settings]
     )
 # Launch the app
 if __name__ == "__main__":
     if not model_loaded:
         print(f"Warning: Model failed to load - {status_message}")
+    if not asr_pipe:
+        print("Warning: ASR pipeline not available - auto-transcription disabled")
     demo.launch()