yl4579 committed
Commit 51a612e · verified · 1 Parent(s): 4972f24

Update app.py

Files changed (1): app.py (+176, -213)
app.py CHANGED
@@ -14,37 +14,61 @@ from transformers import pipeline
 from infer import DMOInference
 
 # Global variables
-model = None
+model_paths = {"student": None, "duration": None}
 asr_pipe = None
-device = "cuda" if torch.cuda.is_available() else "cpu"
+model_downloaded = False
 
-# Initialize ASR pipeline
-def initialize_asr_pipeline(device=device, dtype=None):
-    """Initialize the ASR pipeline on startup."""
-    global asr_pipe
+# Download models on startup (CPU)
+def download_models():
+    """Download models from HuggingFace Hub."""
+    global model_downloaded, model_paths
 
-    if dtype is None:
-        dtype = (
-            torch.float16
-            if "cuda" in device
-            and torch.cuda.is_available()
-            and torch.cuda.get_device_properties(device).major >= 7
-            and not torch.cuda.get_device_name().endswith("[ZLUDA]")
-            else torch.float32
+    try:
+        print("Downloading models from HuggingFace...")
+
+        # Download student model
+        student_path = hf_hub_download(
+            repo_id="yl4579/DMOSpeech2",
+            filename="model_85000.pt",
+            cache_dir="./models"
         )
+
+        # Download duration predictor
+        duration_path = hf_hub_download(
+            repo_id="yl4579/DMOSpeech2",
+            filename="model_1500.pt",
+            cache_dir="./models"
+        )
+
+        model_paths["student"] = student_path
+        model_paths["duration"] = duration_path
+        model_downloaded = True
+
+        print(f"✓ Models downloaded successfully")
+        return True
+
+    except Exception as e:
+        print(f"Error downloading models: {e}")
+        return False
+
+# Initialize ASR pipeline on CPU
+def initialize_asr_pipeline():
+    """Initialize the ASR pipeline on startup."""
+    global asr_pipe
 
     print("Initializing ASR pipeline...")
     try:
         asr_pipe = pipeline(
             "automatic-speech-recognition",
             model="openai/whisper-large-v3-turbo",
-            torch_dtype=dtype,
-            device="cpu"  # Keep ASR on CPU to save GPU memory
+            torch_dtype=torch.float32,
+            device="cpu"  # Always use CPU for ASR to save GPU memory
         )
-        print("ASR pipeline initialized successfully")
+        print("ASR pipeline initialized successfully")
+        return True
     except Exception as e:
         print(f"Error initializing ASR pipeline: {e}")
-        asr_pipe = None
+        return False
 
 # Transcribe function
 def transcribe(ref_audio, language=None):
@@ -52,7 +76,7 @@ def transcribe(ref_audio, language=None):
     global asr_pipe
 
     if asr_pipe is None:
-        return ""  # Return empty string if ASR is not available
+        return ""
 
     try:
         result = asr_pipe(
@@ -67,65 +91,14 @@
         print(f"Transcription error: {e}")
         return ""
 
-def download_models():
-    """Download models from HuggingFace Hub."""
-    try:
-        print("Downloading models from HuggingFace...")
-
-        # Download student model
-        student_path = hf_hub_download(
-            repo_id="yl4579/DMOSpeech2",
-            filename="model_85000.pt",
-            cache_dir="./models"
-        )
-
-        # Download duration predictor
-        duration_path = hf_hub_download(
-            repo_id="yl4579/DMOSpeech2",
-            filename="model_1500.pt",
-            cache_dir="./models"
-        )
-
-        print(f"Student model: {student_path}")
-        print(f"Duration model: {duration_path}")
-
-        return student_path, duration_path
-
-    except Exception as e:
-        print(f"Error downloading models: {e}")
-        return None, None
-
-def initialize_model():
-    """Initialize the model on startup."""
-    global model
-
-    try:
-        # Download models
-        student_path, duration_path = download_models()
-
-        if not student_path or not duration_path:
-            return False, "Failed to download models from HuggingFace"
-
-        # Initialize model
-        model = DMOInference(
-            student_checkpoint_path=student_path,
-            duration_predictor_path=duration_path,
-            device=device,
-            model_type="F5TTS_Base"
-        )
-
-        return True, f"Model loaded successfully on {device.upper()}"
-
-    except Exception as e:
-        return False, f"Error initializing model: {str(e)}"
+# Initialize on startup
+print("Starting DMOSpeech 2...")
+models_ready = download_models()
+asr_ready = initialize_asr_pipeline()
+status_message = f"Models: {'✅' if models_ready else '❌'} | ASR: {'✅' if asr_ready else '❌'}"
 
-# Initialize models on startup
-print("Initializing models...")
-model_loaded, status_message = initialize_model()
-initialize_asr_pipeline()  # Initialize ASR pipeline
-
-@spaces.GPU(duration=120)  # Request GPU for up to 120 seconds
-def generate_speech(
+@spaces.GPU(duration=120)
+def generate_speech_gpu(
     prompt_audio,
     prompt_text,
     target_text,
@@ -136,53 +109,72 @@ def generate_speech(
    custom_student_start_step,
    verbose
 ):
-    """Generate speech with different configurations."""
+    """Generate speech with GPU acceleration."""
 
-    if not model_loaded or model is None:
-        return None, "Model not loaded! Please refresh the page.", "", ""
+    if not model_downloaded:
+        return None, "❌ Models not downloaded! Please refresh the page.", "", "", prompt_text
 
     if prompt_audio is None:
-        return None, "Please upload a reference audio!", "", ""
+        return None, "Please upload a reference audio!", "", "", prompt_text
 
     if not target_text:
-        return None, "Please enter text to generate!", "", ""
+        return None, "Please enter text to generate!", "", "", prompt_text
 
     try:
-        # Auto-transcribe if prompt_text is empty
-        if not prompt_text and prompt_text != "":
+        # Initialize model on GPU
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        print(f"Initializing model on {device}...")
+
+        model = DMOInference(
+            student_checkpoint_path=model_paths["student"],
+            duration_predictor_path=model_paths["duration"],
+            device=device,
+            model_type="F5TTS_Base"
+        )
+
+        # Auto-transcribe if needed (this happens on CPU)
+        transcribed_text = prompt_text  # Default to provided text
+        if not prompt_text.strip():
            print("Auto-transcribing reference audio...")
-            prompt_text = transcribe(prompt_audio)
-            print(f"Transcribed: {prompt_text}")
+            transcribed_text = transcribe(prompt_audio)
+            print(f"Transcribed: {transcribed_text}")
 
         start_time = time.time()
 
         # Configure parameters based on mode
-        if mode == "Student Only (4 steps)":
-            teacher_steps = 0
-            student_start_step = 0
-            teacher_stopping_time = 1.0
-        elif mode == "Teacher-Guided (8 steps)":
-            # Default configuration from the notebook
-            teacher_steps = 16
-            teacher_stopping_time = 0.07
-            student_start_step = 1
-        elif mode == "High Diversity (16 steps)":
-            teacher_steps = 24
-            teacher_stopping_time = 0.3
-            student_start_step = 2
-        else:  # Custom
-            teacher_steps = custom_teacher_steps
-            teacher_stopping_time = custom_teacher_stopping_time
-            student_start_step = custom_student_start_step
+        configs = {
+            "Student Only (4 steps)": {
+                "teacher_steps": 0,
+                "student_start_step": 0,
+                "teacher_stopping_time": 1.0
+            },
+            "Teacher-Guided (8 steps)": {
+                "teacher_steps": 16,
+                "teacher_stopping_time": 0.07,
+                "student_start_step": 1
+            },
+            "High Diversity (16 steps)": {
+                "teacher_steps": 24,
+                "teacher_stopping_time": 0.3,
+                "student_start_step": 2
+            },
+            "Custom": {
+                "teacher_steps": custom_teacher_steps,
+                "teacher_stopping_time": custom_teacher_stopping_time,
+                "student_start_step": custom_student_start_step
+            }
+        }
+
+        config = configs[mode]
 
         # Generate speech
         generated_audio = model.generate(
             gen_text=target_text,
             audio_path=prompt_audio,
-            prompt_text=prompt_text if prompt_text else None,
-            teacher_steps=teacher_steps,
-            teacher_stopping_time=teacher_stopping_time,
-            student_start_step=student_start_step,
+            prompt_text=transcribed_text if transcribed_text else None,
+            teacher_steps=config["teacher_steps"],
+            teacher_stopping_time=config["teacher_stopping_time"],
+            student_start_step=config["student_start_step"],
             temperature=temperature,
             verbose=verbose
         )
@@ -206,29 +198,50 @@ def generate_speech(
 
         torchaudio.save(output_path, generated_audio, 24000)
 
-        # Format metrics
-        metrics = f"RTF: {rtf:.2f}x ({1/rtf:.2f}x speed) | Processing: {processing_time:.2f}s for {audio_duration:.2f}s audio"
+        # Format output
+        metrics = f"""RTF: {rtf:.2f}x ({1/rtf:.2f}x faster)
+        Processing: {processing_time:.2f}s for {audio_duration:.2f}s audio
+        Device: {device.upper()}"""
 
-        return output_path, "Success!", metrics, f"Mode: {mode} | Transcribed: {prompt_text[:50]}..." if not prompt_text else f"Mode: {mode}"
+        info = f"Mode: {mode}"
+        if not prompt_text.strip():
+            info += f" | Auto-transcribed"
+
+        # Clean up GPU memory
+        del model
+        if device == "cuda":
+            torch.cuda.empty_cache()
+
+        # Return transcribed text to update the textbox
+        return output_path, "✅ Success!", metrics, info, transcribed_text
 
     except Exception as e:
-        return None, f"Error: {str(e)}", "", ""
+        import traceback
+        print(traceback.format_exc())
+        return None, f"❌ Error: {str(e)}", "", "", prompt_text
 
 # Create Gradio interface
-with gr.Blocks(title="DMOSpeech 2 - Zero-Shot TTS", theme=gr.themes.Soft()) as demo:
-    gr.Markdown(f"""
-    # 🎙️ DMOSpeech 2: Zero-Shot Text-to-Speech
+with gr.Blocks(
+    title="DMOSpeech 2 - Zero-Shot TTS",
+    theme=gr.themes.Soft(),
+    css="""
+    .gradio-container { max-width: 1200px !important; }
+    """
+) as demo:
 
-    Generate natural speech in any voice with just a short reference audio!
-
-    **Model Status:** {status_message} | **Device:** {device.upper()} | **ASR:** {"✅ Ready" if asr_pipe else "❌ Not available"}
+    gr.Markdown(f"""
+    <div style="text-align: center;">
+        <h1>🎙️ DMOSpeech 2: Zero-Shot Text-to-Speech</h1>
+        <p>Generate natural speech in any voice with just a 3-10 second reference!</p>
+        <p><b>System Status:</b> {status_message}</p>
+    </div>
    """)
 
     with gr.Row():
         with gr.Column(scale=1):
-            # Reference audio input
+            # Inputs
             prompt_audio = gr.Audio(
-                label="📎 Reference Audio",
+                label="📎 Reference Audio (3-10 seconds)",
                 type="filepath",
                 sources=["upload", "microphone"]
             )
@@ -245,7 +258,6 @@ with gr.Blocks(title="DMOSpeech 2 - Zero-Shot TTS", theme=gr.themes.Soft()) as d
                lines=4
            )
 
-            # Generation mode
            mode = gr.Radio(
                choices=[
                    "Student Only (4 steps)",
@@ -255,10 +267,10 @@ with gr.Blocks(title="DMOSpeech 2 - Zero-Shot TTS", theme=gr.themes.Soft()) as d
                ],
                value="Teacher-Guided (8 steps)",
                label="🚀 Generation Mode",
-                info="Choose speed vs quality/diversity tradeoff"
+                info="Speed vs quality tradeoff"
            )
 
-            # Advanced settings (collapsible)
+            # Advanced settings
            with gr.Accordion("⚙️ Advanced Settings", open=False):
                temperature = gr.Slider(
                    minimum=0.0,
@@ -266,115 +278,76 @@ with gr.Blocks(title="DMOSpeech 2 - Zero-Shot TTS", theme=gr.themes.Soft()) as d
                    value=0.0,
                    step=0.1,
                    label="Duration Temperature",
-                    info="0 = deterministic, >0 = more variation in speech rhythm"
+                    info="0 = consistent, >0 = varied rhythm"
                )
 
-                with gr.Group(visible=False) as custom_settings:
-                    gr.Markdown("### Custom Mode Settings")
-                    custom_teacher_steps = gr.Slider(
-                        minimum=0,
-                        maximum=32,
-                        value=16,
-                        step=1,
-                        label="Teacher Steps",
-                        info="More steps = higher quality"
-                    )
-
-                    custom_teacher_stopping_time = gr.Slider(
-                        minimum=0.0,
-                        maximum=1.0,
-                        value=0.07,
-                        step=0.01,
-                        label="Teacher Stopping Time",
-                        info="When to switch to student"
-                    )
-
-                    custom_student_start_step = gr.Slider(
-                        minimum=0,
-                        maximum=4,
-                        value=1,
-                        step=1,
-                        label="Student Start Step",
-                        info="Which student step to start from"
-                    )
+                with gr.Group(visible=False) as custom_group:
+                    custom_teacher_steps = gr.Slider(0, 32, 16, 1, label="Teacher Steps")
+                    custom_teacher_stopping_time = gr.Slider(0.0, 1.0, 0.07, 0.01, label="Stopping Time")
+                    custom_student_start_step = gr.Slider(0, 4, 1, 1, label="Student Start Step")
 
-                verbose = gr.Checkbox(
-                    value=False,
-                    label="Verbose Output",
-                    info="Show detailed generation steps"
-                )
+                verbose = gr.Checkbox(False, label="Verbose Output")
 
            generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
 
        with gr.Column(scale=1):
-            # Output
            output_audio = gr.Audio(
                label="🔊 Generated Speech",
                type="filepath",
                autoplay=True
            )
 
-            status = gr.Textbox(
-                label="Status",
-                interactive=False
-            )
-
-            metrics = gr.Textbox(
-                label="Performance Metrics",
-                interactive=False
-            )
-
-            info = gr.Textbox(
-                label="Generation Info",
-                interactive=False
-            )
+            # Outputs
+            status = gr.Textbox(label="Status", interactive=False)
+            metrics = gr.Textbox(label="Performance", interactive=False, lines=3)
+            info = gr.Textbox(label="Info", interactive=False)
 
-    # Tips
+    # Guide
    gr.Markdown("""
-    ### 💡 Quick Tips:
+    ### 💡 Quick Guide
 
-    - **Auto-transcription**: Leave reference text empty to auto-transcribe
-    - **Student Only**: Fastest (4 steps), good quality
-    - **Teacher-Guided**: Best balance (8 steps), recommended
-    - **High Diversity**: More natural prosody (16 steps)
-    - **Custom Mode**: Fine-tune all parameters
+    | Mode | Speed | Quality | Use Case |
+    |------|-------|---------|----------|
+    | Student Only | 20x realtime | Good | Real-time apps |
+    | Teacher-Guided | 10x realtime | Better | General use |
+    | High Diversity | 5x realtime | Best | Production |
 
-    ### 📊 Expected RTF (Real-Time Factor):
-    - Student Only: ~0.05x (20x faster than real-time)
-    - Teacher-Guided: ~0.10x (10x faster)
-    - High Diversity: ~0.20x (5x faster)
+    **Tips:**
+    - Leave reference text empty for auto-transcription
+    - Auto-transcription only happens once - the text will be filled in
+    - Use temperature > 0 for more natural rhythm variation
+    - Custom mode lets you fine-tune all parameters
    """)
 
-    # Examples section
-    gr.Markdown("### 🎯 Example Configurations")
+    # Examples
+    gr.Markdown("### 🎯 Example Texts")
 
    gr.Markdown("""
    <details>
    <summary>English Example</summary>
 
-    **Reference text:** "Some call me nature, others call me mother nature."
+    **Reference:** "Some call me nature, others call me mother nature."
 
-    **Target text:** "I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring."
+    **Target:** "I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring."
    </details>
 
    <details>
    <summary>Chinese Example</summary>
 
-    **Reference text:** "对,这就是我,万人敬仰的太乙真人。"
+    **Reference:** "对,这就是我,万人敬仰的太乙真人。"
 
-    **Target text:** "突然,身边一阵笑声。我看着他们,意气风发地挺直了胸膛,甩了甩那稍显肉感的双臂,轻笑道:'我身上的肉,是为了掩饰我爆棚的魅力,否则,岂不吓坏了你们呢?'"
+    **Target:** "突然,身边一阵笑声。我看着他们,意气风发地挺直了胸膛,甩了甩那稍显肉感的双臂,轻笑道:'我身上的肉,是为了掩饰我爆棚的魅力,否则,岂不吓坏了你们呢?'"
    </details>
+    """)
 
-    <details>
-    <summary>High Diversity Chinese Example</summary>
+    # Event handlers
+    def toggle_custom(mode):
+        return gr.update(visible=(mode == "Custom"))
 
-    Same as above but with **Temperature: 0.8** for more natural variation in speech rhythm.
-    </details>
-    """)
+    mode.change(toggle_custom, [mode], [custom_group])
 
-    # Event handler
    generate_btn.click(
-        generate_speech,
+        generate_speech_gpu,
        inputs=[
            prompt_audio,
            prompt_text,
@@ -386,25 +359,15 @@ with gr.Blocks(title="DMOSpeech 2 - Zero-Shot TTS", theme=gr.themes.Soft()) as d
            custom_student_start_step,
            verbose
        ],
-        outputs=[output_audio, status, metrics, info]
-    )
-
-    # Update visibility of custom settings based on mode
-    def update_custom_visibility(mode):
-        is_custom = (mode == "Custom")
-        return gr.update(visible=is_custom)
-
-    mode.change(
-        update_custom_visibility,
-        inputs=[mode],
-        outputs=[custom_settings]
+        outputs=[
+            output_audio,
+            status,
+            metrics,
+            info,
+            prompt_text  # Update the prompt_text textbox with transcribed text
+        ]
    )
 
-# Launch the app
+# Launch
 if __name__ == "__main__":
-    if not model_loaded:
-        print(f"Warning: Model failed to load - {status_message}")
-    if not asr_pipe:
-        print("Warning: ASR pipeline not available - auto-transcription disabled")
-
    demo.launch()
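
The diff above amounts to a switch to the usual ZeroGPU Spaces flow: checkpoints are downloaded once at import time on CPU, the DMOInference model is constructed inside the @spaces.GPU window, and GPU memory is freed before the lease ends. A minimal sketch of that flow, assuming hard-coded checkpoint paths and a hypothetical synthesize() wrapper in place of the full Gradio app:

    import torch
    import spaces  # Hugging Face Spaces SDK; supplies the GPU decorator used above
    from infer import DMOInference  # same inference class app.py imports

    # Import time: CPU only, no GPU attached yet. app.py fills these
    # paths via hf_hub_download(); the literals here are illustrative.
    model_paths = {"student": "./models/model_85000.pt",
                   "duration": "./models/model_1500.pt"}

    @spaces.GPU(duration=120)  # a GPU is leased only while this function runs
    def synthesize(text, ref_audio):
        device = "cuda" if torch.cuda.is_available() else "cpu"
        # Build the model inside the GPU window, from pre-downloaded weights.
        model = DMOInference(
            student_checkpoint_path=model_paths["student"],
            duration_predictor_path=model_paths["duration"],
            device=device,
            model_type="F5TTS_Base"
        )
        audio = model.generate(gen_text=text, audio_path=ref_audio)
        # Release GPU memory before the lease expires.
        del model
        if device == "cuda":
            torch.cuda.empty_cache()
        return audio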