yl4579 committed
Commit 7b1f9ef · verified · 1 Parent(s): 51a612e

Update app.py

Files changed (1)
  1. app.py +213 -176
app.py CHANGED
@@ -14,61 +14,37 @@ from transformers import pipeline
  from infer import DMOInference
 
  # Global variables
- model_paths = {"student": None, "duration": None}
  asr_pipe = None
- model_downloaded = False
 
- # Download models on startup (CPU)
- def download_models():
-     """Download models from HuggingFace Hub."""
-     global model_downloaded, model_paths
-
-     try:
-         print("Downloading models from HuggingFace...")
-
-         # Download student model
-         student_path = hf_hub_download(
-             repo_id="yl4579/DMOSpeech2",
-             filename="model_85000.pt",
-             cache_dir="./models"
-         )
-
-         # Download duration predictor
-         duration_path = hf_hub_download(
-             repo_id="yl4579/DMOSpeech2",
-             filename="model_1500.pt",
-             cache_dir="./models"
-         )
-
-         model_paths["student"] = student_path
-         model_paths["duration"] = duration_path
-         model_downloaded = True
-
-         print(f"✓ Models downloaded successfully")
-         return True
-
-     except Exception as e:
-         print(f"Error downloading models: {e}")
-         return False
-
- # Initialize ASR pipeline on CPU
- def initialize_asr_pipeline():
      """Initialize the ASR pipeline on startup."""
      global asr_pipe
 
      print("Initializing ASR pipeline...")
      try:
          asr_pipe = pipeline(
              "automatic-speech-recognition",
              model="openai/whisper-large-v3-turbo",
-             torch_dtype=torch.float32,
-             device="cpu"  # Always use CPU for ASR to save GPU memory
          )
-         print("ASR pipeline initialized successfully")
-         return True
      except Exception as e:
          print(f"Error initializing ASR pipeline: {e}")
-         return False
 
  # Transcribe function
  def transcribe(ref_audio, language=None):
@@ -76,7 +52,7 @@ def transcribe(ref_audio, language=None):
      global asr_pipe
 
      if asr_pipe is None:
-         return ""
 
      try:
          result = asr_pipe(
@@ -91,14 +67,65 @@ def transcribe(ref_audio, language=None):
          print(f"Transcription error: {e}")
          return ""
 
- # Initialize on startup
- print("Starting DMOSpeech 2...")
- models_ready = download_models()
- asr_ready = initialize_asr_pipeline()
- status_message = f"Models: {'✅' if models_ready else '❌'} | ASR: {'✅' if asr_ready else '❌'}"
 
- @spaces.GPU(duration=120)
- def generate_speech_gpu(
      prompt_audio,
      prompt_text,
      target_text,
@@ -109,72 +136,53 @@ def generate_speech_gpu(
      custom_student_start_step,
      verbose
  ):
-     """Generate speech with GPU acceleration."""
 
-     if not model_downloaded:
-         return None, " Models not downloaded! Please refresh the page.", "", "", prompt_text
 
      if prompt_audio is None:
-         return None, "Please upload a reference audio!", "", "", prompt_text
 
      if not target_text:
-         return None, "Please enter text to generate!", "", "", prompt_text
 
      try:
-         # Initialize model on GPU
-         device = "cuda" if torch.cuda.is_available() else "cpu"
-         print(f"Initializing model on {device}...")
-
-         model = DMOInference(
-             student_checkpoint_path=model_paths["student"],
-             duration_predictor_path=model_paths["duration"],
-             device=device,
-             model_type="F5TTS_Base"
-         )
-
-         # Auto-transcribe if needed (this happens on CPU)
-         transcribed_text = prompt_text  # Default to provided text
-         if not prompt_text.strip():
              print("Auto-transcribing reference audio...")
-             transcribed_text = transcribe(prompt_audio)
-             print(f"Transcribed: {transcribed_text}")
 
          start_time = time.time()
 
          # Configure parameters based on mode
-         configs = {
-             "Student Only (4 steps)": {
-                 "teacher_steps": 0,
-                 "student_start_step": 0,
-                 "teacher_stopping_time": 1.0
-             },
-             "Teacher-Guided (8 steps)": {
-                 "teacher_steps": 16,
-                 "teacher_stopping_time": 0.07,
-                 "student_start_step": 1
-             },
-             "High Diversity (16 steps)": {
-                 "teacher_steps": 24,
-                 "teacher_stopping_time": 0.3,
-                 "student_start_step": 2
-             },
-             "Custom": {
-                 "teacher_steps": custom_teacher_steps,
-                 "teacher_stopping_time": custom_teacher_stopping_time,
-                 "student_start_step": custom_student_start_step
-             }
-         }
-
-         config = configs[mode]
 
          # Generate speech
          generated_audio = model.generate(
              gen_text=target_text,
              audio_path=prompt_audio,
-             prompt_text=transcribed_text if transcribed_text else None,
-             teacher_steps=config["teacher_steps"],
-             teacher_stopping_time=config["teacher_stopping_time"],
-             student_start_step=config["student_start_step"],
              temperature=temperature,
              verbose=verbose
          )
@@ -198,50 +206,29 @@ def generate_speech_gpu(
 
          torchaudio.save(output_path, generated_audio, 24000)
 
-         # Format output
-         metrics = f"""RTF: {rtf:.2f}x ({1/rtf:.2f}x faster)
- Processing: {processing_time:.2f}s for {audio_duration:.2f}s audio
- Device: {device.upper()}"""
 
-         info = f"Mode: {mode}"
-         if not prompt_text.strip():
-             info += f" | Auto-transcribed"
-
-         # Clean up GPU memory
-         del model
-         if device == "cuda":
-             torch.cuda.empty_cache()
-
-         # Return transcribed text to update the textbox
-         return output_path, "✅ Success!", metrics, info, transcribed_text
 
      except Exception as e:
-         import traceback
-         print(traceback.format_exc())
-         return None, f"❌ Error: {str(e)}", "", "", prompt_text
 
  # Create Gradio interface
- with gr.Blocks(
-     title="DMOSpeech 2 - Zero-Shot TTS",
-     theme=gr.themes.Soft(),
-     css="""
-         .gradio-container { max-width: 1200px !important; }
-     """
- ) as demo:
-
      gr.Markdown(f"""
-     <div style="text-align: center;">
-         <h1>🎙️ DMOSpeech 2: Zero-Shot Text-to-Speech</h1>
-         <p>Generate natural speech in any voice with just a 3-10 second reference!</p>
-         <p><b>System Status:</b> {status_message}</p>
-     </div>
      """)
 
      with gr.Row():
          with gr.Column(scale=1):
-             # Inputs
              prompt_audio = gr.Audio(
-                 label="📎 Reference Audio (3-10 seconds)",
                  type="filepath",
                  sources=["upload", "microphone"]
              )
@@ -258,6 +245,7 @@ with gr.Blocks(
                  lines=4
              )
 
              mode = gr.Radio(
                  choices=[
                      "Student Only (4 steps)",
@@ -267,10 +255,10 @@ with gr.Blocks(
                  ],
                  value="Teacher-Guided (8 steps)",
                  label="🚀 Generation Mode",
-                 info="Speed vs quality tradeoff"
              )
 
-             # Advanced settings
              with gr.Accordion("⚙️ Advanced Settings", open=False):
                  temperature = gr.Slider(
                      minimum=0.0,
@@ -278,76 +266,115 @@ with gr.Blocks(
                      value=0.0,
                      step=0.1,
                      label="Duration Temperature",
-                     info="0 = consistent, >0 = varied rhythm"
                  )
 
-                 with gr.Group(visible=False) as custom_group:
-                     custom_teacher_steps = gr.Slider(0, 32, 16, 1, label="Teacher Steps")
-                     custom_teacher_stopping_time = gr.Slider(0.0, 1.0, 0.07, 0.01, label="Stopping Time")
-                     custom_student_start_step = gr.Slider(0, 4, 1, 1, label="Student Start Step")
 
-                 verbose = gr.Checkbox(False, label="Verbose Output")
 
              generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
 
          with gr.Column(scale=1):
-             # Outputs
              output_audio = gr.Audio(
                  label="🔊 Generated Speech",
                  type="filepath",
                  autoplay=True
             )
 
-             status = gr.Textbox(label="Status", interactive=False)
-             metrics = gr.Textbox(label="Performance", interactive=False, lines=3)
-             info = gr.Textbox(label="Info", interactive=False)
 
-     # Guide
      gr.Markdown("""
-     ### 💡 Quick Guide
-
-     | Mode | Speed | Quality | Use Case |
-     |------|-------|---------|----------|
-     | Student Only | 20x realtime | Good | Real-time apps |
-     | Teacher-Guided | 10x realtime | Better | General use |
-     | High Diversity | 5x realtime | Best | Production |
-
-     **Tips:**
-     - Leave reference text empty for auto-transcription
-     - Auto-transcription only happens once - the text will be filled in
-     - Use temperature > 0 for more natural rhythm variation
-     - Custom mode lets you fine-tune all parameters
      """)
 
-     # Examples
-     gr.Markdown("### 🎯 Example Texts")
 
      gr.Markdown("""
      <details>
      <summary>English Example</summary>
 
-     **Reference:** "Some call me nature, others call me mother nature."
 
-     **Target:** "I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring."
      </details>
 
      <details>
      <summary>Chinese Example</summary>
 
-     **Reference:** "对,这就是我,万人敬仰的太乙真人。"
 
-     **Target:** "突然,身边一阵笑声。我看着他们,意气风发地挺直了胸膛,甩了甩那稍显肉感的双臂,轻笑道:'我身上的肉,是为了掩饰我爆棚的魅力,否则,岂不吓坏了你们呢?'"
      </details>
-     """)
 
-     # Event handlers
-     def toggle_custom(mode):
-         return gr.update(visible=(mode == "Custom"))
 
-     mode.change(toggle_custom, [mode], [custom_group])
 
      generate_btn.click(
-         generate_speech_gpu,
          inputs=[
              prompt_audio,
              prompt_text,
@@ -359,15 +386,25 @@ with gr.Blocks(
              custom_student_start_step,
              verbose
          ],
-         outputs=[
-             output_audio,
-             status,
-             metrics,
-             info,
-             prompt_text  # Update the prompt_text textbox with transcribed text
-         ]
      )
 
- # Launch
  if __name__ == "__main__":
      demo.launch()
 
  from infer import DMOInference
 
  # Global variables
+ model = None
  asr_pipe = None
+ device = "cuda" if torch.cuda.is_available() else "cpu"
 
+ # Initialize ASR pipeline
+ def initialize_asr_pipeline(device="cpu", dtype=None):  # ASR defaults to CPU so the GPU stays free for TTS
      """Initialize the ASR pipeline on startup."""
      global asr_pipe
 
+     if dtype is None:
+         dtype = (
+             torch.float16
+             if "cuda" in device
+             and torch.cuda.is_available()
+             and torch.cuda.get_device_properties(device).major >= 7
+             and not torch.cuda.get_device_name().endswith("[ZLUDA]")
+             else torch.float32
+         )
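+     # fp16 is selected above only for CUDA devices with compute capability
+     # >= 7.0 that are not running under the ZLUDA translation layer; the CPU
+     # default used by this app resolves to float32.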
+ 
      print("Initializing ASR pipeline...")
      try:
          asr_pipe = pipeline(
              "automatic-speech-recognition",
              model="openai/whisper-large-v3-turbo",
+             torch_dtype=dtype,
+             device=device  # CPU by default, to keep GPU memory free for TTS
          )
+         print("ASR pipeline initialized successfully")
      except Exception as e:
          print(f"Error initializing ASR pipeline: {e}")
+         asr_pipe = None
 
  # Transcribe function
  def transcribe(ref_audio, language=None):
      global asr_pipe
 
      if asr_pipe is None:
+         return ""  # Return empty string if ASR is not available
 
      try:
          result = asr_pipe(
 
          print(f"Transcription error: {e}")
          return ""
 
+ def download_models():
+     """Download models from HuggingFace Hub."""
+     try:
+         print("Downloading models from HuggingFace...")
+ 
+         # Download student model
+         student_path = hf_hub_download(
+             repo_id="yl4579/DMOSpeech2",
+             filename="model_85000.pt",
+             cache_dir="./models"
+         )
+ 
+         # Download duration predictor
+         duration_path = hf_hub_download(
+             repo_id="yl4579/DMOSpeech2",
+             filename="model_1500.pt",
+             cache_dir="./models"
+         )
+ 
+         print(f"Student model: {student_path}")
+         print(f"Duration model: {duration_path}")
+ 
+         return student_path, duration_path
+ 
+     except Exception as e:
+         print(f"Error downloading models: {e}")
+         return None, None
+ 
+ def initialize_model():
+     """Initialize the model on startup."""
+     global model
+ 
+     try:
+         # Download models
+         student_path, duration_path = download_models()
+ 
+         if not student_path or not duration_path:
+             return False, "Failed to download models from HuggingFace"
+ 
+         # Initialize model
+         model = DMOInference(
+             student_checkpoint_path=student_path,
+             duration_predictor_path=duration_path,
+             device=device,
+             model_type="F5TTS_Base"
+         )
+ )
116
+
117
+ return True, f"Model loaded successfully on {device.upper()}"
118
+
119
+ except Exception as e:
120
+ return False, f"Error initializing model: {str(e)}"
121
 
122
+ # Initialize models on startup
123
+ print("Initializing models...")
124
+ model_loaded, status_message = initialize_model()
125
+ initialize_asr_pipeline() # Initialize ASR pipeline
126
+
127
+ @spaces.GPU(duration=120) # Request GPU for up to 120 seconds
128
+ def generate_speech(
129
  prompt_audio,
130
  prompt_text,
131
  target_text,
 
136
  custom_student_start_step,
137
  verbose
138
  ):
139
+ """Generate speech with different configurations."""
140
 
141
+ if not model_loaded or model is None:
142
+ return None, "Model not loaded! Please refresh the page.", "", ""
143
 
144
  if prompt_audio is None:
145
+ return None, "Please upload a reference audio!", "", ""
146
 
147
  if not target_text:
148
+ return None, "Please enter text to generate!", "", ""
149
 
150
  try:
151
+ # Auto-transcribe if prompt_text is empty
152
+ if not prompt_text and prompt_text != "":
 
 
 
 
 
 
 
 
 
 
 
 
153
  print("Auto-transcribing reference audio...")
154
+ prompt_text = transcribe(prompt_audio)
155
+ print(f"Transcribed: {prompt_text}")
156
 
157
  start_time = time.time()
158
 
159
  # Configure parameters based on mode
160
+ if mode == "Student Only (4 steps)":
161
+ teacher_steps = 0
162
+ student_start_step = 0
163
+ teacher_stopping_time = 1.0
164
+ elif mode == "Teacher-Guided (8 steps)":
165
+ # Default configuration from the notebook
166
+ teacher_steps = 16
167
+ teacher_stopping_time = 0.07
168
+ student_start_step = 1
169
+ elif mode == "High Diversity (16 steps)":
170
+ teacher_steps = 24
171
+ teacher_stopping_time = 0.3
172
+ student_start_step = 2
173
+ else: # Custom
174
+ teacher_steps = custom_teacher_steps
175
+ teacher_stopping_time = custom_teacher_stopping_time
176
+ student_start_step = custom_student_start_step
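+ 
+         # The teacher sampler runs for up to teacher_steps until
+         # teacher_stopping_time, then the distilled student takes over from
+         # student_start_step; Student Only (teacher_steps=0) skips the teacher.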
 
          # Generate speech
          generated_audio = model.generate(
              gen_text=target_text,
              audio_path=prompt_audio,
+             prompt_text=prompt_text if prompt_text else None,
+             teacher_steps=teacher_steps,
+             teacher_stopping_time=teacher_stopping_time,
+             student_start_step=student_start_step,
              temperature=temperature,
              verbose=verbose
          )
 
          torchaudio.save(output_path, generated_audio, 24000)
 
+         # Format metrics
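+         # (RTF = processing time / generated audio duration, so 1/RTF is the
+         # speedup over real time: RTF 0.10x means 10x faster than real time)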
+         metrics = f"RTF: {rtf:.2f}x ({1/rtf:.2f}x faster than real-time) | Processing: {processing_time:.2f}s for {audio_duration:.2f}s audio"
 
+         generation_info = (
+             f"Mode: {mode} | Transcribed: {prompt_text[:50]}..."
+             if auto_transcribed
+             else f"Mode: {mode}"
+         )
+ 
+         return output_path, "Success!", metrics, generation_info
 
      except Exception as e:
+         return None, f"Error: {str(e)}", "", ""
 
  # Create Gradio interface
+ with gr.Blocks(title="DMOSpeech 2 - Zero-Shot TTS", theme=gr.themes.Soft()) as demo:
      gr.Markdown(f"""
+     # 🎙️ DMOSpeech 2: Zero-Shot Text-to-Speech
+ 
+     Generate natural speech in any voice from just a short reference audio!
+ 
+     **Model Status:** {status_message} | **Device:** {device.upper()} | **ASR:** {"✅ Ready" if asr_pipe else "❌ Not available"}
      """)
 
      with gr.Row():
          with gr.Column(scale=1):
+             # Reference audio input
              prompt_audio = gr.Audio(
+                 label="📎 Reference Audio",
                  type="filepath",
                  sources=["upload", "microphone"]
              )
 
                  lines=4
              )
 
+             # Generation mode
              mode = gr.Radio(
                  choices=[
                      "Student Only (4 steps)",
 
                  ],
                  value="Teacher-Guided (8 steps)",
                  label="🚀 Generation Mode",
+                 info="Choose the speed vs quality/diversity tradeoff"
              )
 
+             # Advanced settings (collapsible)
              with gr.Accordion("⚙️ Advanced Settings", open=False):
                  temperature = gr.Slider(
                      minimum=0.0,
 
                      value=0.0,
                      step=0.1,
                      label="Duration Temperature",
+                     info="0 = deterministic, >0 = more variation in speech rhythm"
                  )
 
+                 with gr.Group(visible=False) as custom_settings:
+                     gr.Markdown("### Custom Mode Settings")
+                     custom_teacher_steps = gr.Slider(
+                         minimum=0,
+                         maximum=32,
+                         value=16,
+                         step=1,
+                         label="Teacher Steps",
+                         info="More steps = higher quality"
+                     )
+ 
+                     custom_teacher_stopping_time = gr.Slider(
+                         minimum=0.0,
+                         maximum=1.0,
+                         value=0.07,
+                         step=0.01,
+                         label="Teacher Stopping Time",
+                         info="When to hand off to the student"
+                     )
+ 
+                     custom_student_start_step = gr.Slider(
+                         minimum=0,
+                         maximum=4,
+                         value=1,
+                         step=1,
+                         label="Student Start Step",
+                         info="Which student step to start from"
+                     )
 
+                 verbose = gr.Checkbox(
+                     value=False,
+                     label="Verbose Output",
+                     info="Show detailed generation steps"
+                 )
 
              generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
 
          with gr.Column(scale=1):
+             # Output
              output_audio = gr.Audio(
                  label="🔊 Generated Speech",
                  type="filepath",
                  autoplay=True
              )
 
+             status = gr.Textbox(
+                 label="Status",
+                 interactive=False
+             )
+ 
+             metrics = gr.Textbox(
+                 label="Performance Metrics",
+                 interactive=False
+             )
+ 
+             info = gr.Textbox(
+                 label="Generation Info",
+                 interactive=False
+             )
 
+     # Tips
      gr.Markdown("""
+     ### 💡 Quick Tips
 
+     - **Auto-transcription**: Leave the reference text empty to auto-transcribe
+     - **Student Only**: Fastest (4 steps), good quality
+     - **Teacher-Guided**: Best balance (8 steps), recommended
+     - **High Diversity**: More natural prosody (16 steps)
+     - **Custom Mode**: Fine-tune all parameters
 
+     ### 📊 Expected RTF (Real-Time Factor)
+     - Student Only: ~0.05x (20x faster than real time)
+     - Teacher-Guided: ~0.10x (10x faster)
+     - High Diversity: ~0.20x (5x faster)
      """)
 
+     # Examples section
+     gr.Markdown("### 🎯 Example Configurations")
 
      gr.Markdown("""
      <details>
      <summary>English Example</summary>
 
+     **Reference text:** "Some call me nature, others call me mother nature."
 
+     **Target text:** "I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring."
      </details>
 
      <details>
      <summary>Chinese Example</summary>
 
+     **Reference text:** "对,这就是我,万人敬仰的太乙真人。"
 
+     **Target text:** "突然,身边一阵笑声。我看着他们,意气风发地挺直了胸膛,甩了甩那稍显肉感的双臂,轻笑道:'我身上的肉,是为了掩饰我爆棚的魅力,否则,岂不吓坏了你们呢?'"
      </details>
 
+     <details>
+     <summary>High Diversity Chinese Example</summary>
 
+     Same as above, but with **Temperature: 0.8** for more natural variation in speech rhythm.
+     </details>
+     """)
 
+     # Event handler
      generate_btn.click(
+         generate_speech,
          inputs=[
              prompt_audio,
              prompt_text,
 
              custom_student_start_step,
              verbose
          ],
+         outputs=[output_audio, status, metrics, info]
+     )
+ 
+     # Update visibility of custom settings based on mode
+     def update_custom_visibility(mode):
+         is_custom = (mode == "Custom")
+         return gr.update(visible=is_custom)
+ 
+     mode.change(
+         update_custom_visibility,
+         inputs=[mode],
+         outputs=[custom_settings]
+     )
 
+ # Launch the app
  if __name__ == "__main__":
+     if not model_loaded:
+         print(f"Warning: Model failed to load - {status_message}")
+     if not asr_pipe:
+         print("Warning: ASR pipeline not available - auto-transcription disabled")
+ 
      demo.launch()
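
For reference, a minimal sketch of the inference path this app wraps, assembled from the calls in the diff above (checkpoint filenames from download_models(), defaults from the "Teacher-Guided (8 steps)" mode). The reference clip and output paths are placeholders, not files shipped with the Space:

    import torch
    import torchaudio
    from huggingface_hub import hf_hub_download
    from infer import DMOInference

    # Checkpoints as downloaded in download_models() above
    student_path = hf_hub_download(repo_id="yl4579/DMOSpeech2", filename="model_85000.pt", cache_dir="./models")
    duration_path = hf_hub_download(repo_id="yl4579/DMOSpeech2", filename="model_1500.pt", cache_dir="./models")

    model = DMOInference(
        student_checkpoint_path=student_path,
        duration_predictor_path=duration_path,
        device="cuda" if torch.cuda.is_available() else "cpu",
        model_type="F5TTS_Base",
    )

    # "Teacher-Guided (8 steps)" defaults from generate_speech()
    generated_audio = model.generate(
        gen_text="Text to synthesize.",  # placeholder target text
        audio_path="reference.wav",      # placeholder reference clip
        prompt_text=None,                # None = no reference transcript provided
        teacher_steps=16,
        teacher_stopping_time=0.07,
        student_start_step=1,
        temperature=0.0,                 # 0 = deterministic duration
        verbose=False,
    )
    torchaudio.save("output.wav", generated_audio, 24000)  # the app saves at 24 kHz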