Spaces:

yl4579
/

DMOSpeech2-demo

Running on Zero

App Files Files Community

yl4579 committed 25 days ago

Commit

a28c293

verified ·

1 Parent(s): 2f84825

Update app.py

Browse files

Files changed (1) hide show

app.py +338 -105

app.py CHANGED Viewed

@@ -1,117 +1,350 @@
-# Add this to your DMOInference class or create a wrapper
-import os
 import torch
 from pathlib import Path
 from huggingface_hub import hf_hub_download
-import re
-def load_checkpoint_from_hf(checkpoint_path, device='cpu'):
-    """
-    Load a checkpoint from either a local path or HuggingFace URL.
-    Supports:
-    - Local paths: /path/to/model.pt
-    - HF URLs: hf://username/repo/model.pt
-    - HF hub format: username/repo/model.pt
-    """
-    if isinstance(checkpoint_path, str):
-        # Check if it's a HuggingFace URL
-        if checkpoint_path.startswith("hf://"):
-            # Parse HF URL: hf://username/repo/path/to/model.pt
-            match = re.match(r"hf://([^/]+/[^/]+)/(.+)", checkpoint_path)
-            if match:
-                repo_id = match.group(1)
-                filename = match.group(2)
-                print(f"Loading from HuggingFace: {repo_id}/{filename}")
-                # Download from HuggingFace
-                local_path = hf_hub_download(
-                    repo_id=repo_id,
-                    filename=filename,
-                    cache_dir=os.environ.get("HF_HOME", "./models")
-                )
-                # Load the checkpoint
-                return torch.load(local_path, map_location=device)
-        # Check if it's a HuggingFace repo format (username/repo/file.pt)
-        elif "/" in checkpoint_path and not os.path.exists(checkpoint_path):
-            parts = checkpoint_path.split("/")
-            if len(parts) >= 3:
-                repo_id = "/".join(parts[:2])
-                filename = "/".join(parts[2:])
-                print(f"Loading from HuggingFace: {repo_id}/{filename}")
-                local_path = hf_hub_download(
-                    repo_id=repo_id,
-                    filename=filename,
-                    cache_dir=os.environ.get("HF_HOME", "./models")
-                )
-                return torch.load(local_path, map_location=device)
-        # Local file path
-        elif os.path.exists(checkpoint_path):
-            print(f"Loading from local path: {checkpoint_path}")
-            return torch.load(checkpoint_path, map_location=device)
-    raise ValueError(f"Could not load checkpoint from: {checkpoint_path}")
-# Modified DMOInference class init (partial)
-class DMOInference:
-    """Partial sketch of the inference class with HF-aware checkpoint loading.
-    Sections marked ``# ...`` are elided in this snippet, so it is not
-    runnable as shown; only the checkpoint-loading call sites are spelled out.
-    """
-    def __init__(
-        self,
-        student_checkpoint_path="",
-        duration_predictor_path="",
-        device="cuda",
-        model_type="F5TTS_Base",
-        tokenizer="pinyin",
-        dataset_name="Emilia_ZH_EN",
-        cuda_device_id="0"
-    ):
-        # ... (previous initialization code) ...
-        # Initialize components
-        self._setup_tokenizer()
-        self._setup_models(student_checkpoint_path)  # Modified to handle HF URLs
-        self._setup_mel_spec()
-        self._setup_vocoder()
-        self._setup_duration_predictor(duration_predictor_path)  # Modified to handle HF URLs
-    def _setup_models(self, student_checkpoint_path):
-        """Initialize teacher and student models with HF support."""
-        # ... (model configuration code) ...
-        # NOTE(review): self.model is presumably created in the elided code
-        # above; it is not assigned anywhere in this snippet.
-        # Load student checkpoint with HF support
-        checkpoint = load_checkpoint_from_hf(student_checkpoint_path, device='cpu')
-        # strict=False: missing/unexpected keys in the state dict are tolerated.
-        self.model.load_state_dict(checkpoint['model_state_dict'], strict=False)
-        # ... (rest of the setup) ...
-    def _setup_duration_predictor(self, checkpoint_path):
-        """Initialize duration predictor with HF support."""
-        # ... (model initialization code) ...
-        # NOTE(review): self.SLP is presumably built in the elided code above.
-        # Load checkpoint with HF support
-        checkpoint = load_checkpoint_from_hf(checkpoint_path, device='cpu')
-        # Default (strict) loading here, unlike the student model above.
-        self.SLP.load_state_dict(checkpoint['model_state_dict'])
-# Wrapper class for easier use
-class DMOInferenceHF(DMOInference):
-    """DMOInference with built-in HuggingFace support."""
-    def __init__(self, **kwargs):
-        # Override checkpoint loading to support HF URLs
-        if 'student_checkpoint_path' in kwargs:
-            self._original_student_path = kwargs['student_checkpoint_path']
-        if 'duration_predictor_path' in kwargs:
-            self._original_duration_path = kwargs['duration_predictor_path']
-        super().__init__(**kwargs)
-    def _load_checkpoint(self, checkpoint_path):
-        """Load checkpoint with HF URL support."""
-        return load_checkpoint_from_hf(checkpoint_path, self.device)

+import gradio as gr
 import torch
+import torchaudio
+import numpy as np
+import tempfile
+import time
 from pathlib import Path
 from huggingface_hub import hf_hub_download
+import os
+# Import the inference module (assuming it's named 'infer.py' based on the notebook)
+from infer import DMOInference
+# Global model instance; stays None until initialize_model() assigns it.
+model = None
+# Use the GPU when torch reports CUDA as available, otherwise fall back to CPU.
+device = "cuda" if torch.cuda.is_available() else "cpu"
+def download_models():
+    """Download models from HuggingFace Hub."""
+    try:
+        print("Downloading models from HuggingFace...")
+        # Download student model
+        student_path = hf_hub_download(
+            repo_id="yl4579/DMOSpeech2",
+            filename="model_85000.pt",
+            cache_dir="./models"
+        )
+        # Download duration predictor
+        duration_path = hf_hub_download(
+            repo_id="yl4579/DMOSpeech2",
+            filename="model_1500.pt",
+            cache_dir="./models"
+        )
+        print(f"Student model: {student_path}")
+        print(f"Duration model: {duration_path}")
+        return student_path, duration_path
+    except Exception as e:
+        print(f"Error downloading models: {e}")
+        return None, None
+def initialize_model():
+    """Initialize the model on startup."""
+    global model
+    try:
+        # Download models
+        student_path, duration_path = download_models()
+        if not student_path or not duration_path:
+            return False, "Failed to download models from HuggingFace"
+        # Initialize model
+        model = DMOInference(
+            student_checkpoint_path=student_path,
+            duration_predictor_path=duration_path,
+            device=device,
+            model_type="F5TTS_Base"
+        )
+        return True, f"Model loaded successfully on {device.upper()}"
+    except Exception as e:
+        return False, f"Error initializing model: {str(e)}"
+# Initialize model on startup. This runs at import time, so the download and
+# model construction happen before the UI below is built; the two results are
+# read by generate_speech() and by the page header.
+model_loaded, status_message = initialize_model()
+def generate_speech(
+    prompt_audio,
+    prompt_text,
+    target_text,
+    mode,
+    # Advanced settings
+    custom_teacher_steps,
+    custom_teacher_stopping_time,
+    custom_student_start_step,
+    temperature,
+    verbose
+):
+    """Generate speech with different configurations."""
+    if not model_loaded or model is None:
+        return None, "Model not loaded! Please refresh the page.", "", ""
+    if prompt_audio is None:
+        return None, "Please upload a reference audio!", "", ""
+    if not target_text:
+        return None, "Please enter text to generate!", "", ""
+    try:
+        start_time = time.time()
+        # Configure parameters based on mode
+        if mode == "Student Only (4 steps)":
+            teacher_steps = 0
+            student_start_step = 0
+            teacher_stopping_time = 1.0
+        elif mode == "Teacher-Guided (8 steps)":
+            # Default configuration from the notebook
+            teacher_steps = 16
+            teacher_stopping_time = 0.07
+            student_start_step = 1
+        elif mode == "High Diversity (16 steps)":
+            teacher_steps = 24
+            teacher_stopping_time = 0.3
+            student_start_step = 2
+        else:  # Custom
+            teacher_steps = custom_teacher_steps
+            teacher_stopping_time = custom_teacher_stopping_time
+            student_start_step = custom_student_start_step
+        # Generate speech
+        generated_audio = model.generate(
+            gen_text=target_text,
+            audio_path=prompt_audio,
+            prompt_text=prompt_text if prompt_text else None,
+            teacher_steps=teacher_steps,
+            teacher_stopping_time=teacher_stopping_time,
+            student_start_step=student_start_step,
+            temperature=temperature,
+            verbose=verbose
+        )
+        end_time = time.time()
+        # Calculate metrics
+        processing_time = end_time - start_time
+        audio_duration = generated_audio.shape[-1] / 24000
+        rtf = processing_time / audio_duration
+        # Save audio
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
+            output_path = tmp_file.name
+        if isinstance(generated_audio, np.ndarray):
+            generated_audio = torch.from_numpy(generated_audio)
+        if generated_audio.dim() == 1:
+            generated_audio = generated_audio.unsqueeze(0)
+        torchaudio.save(output_path, generated_audio, 24000)
+        # Format metrics
+        metrics = f"RTF: {rtf:.2f}x ({1/rtf:.2f}x speed) | Processing: {processing_time:.2f}s for {audio_duration:.2f}s audio"
+        return output_path, "Success!", metrics, f"Mode: {mode}"
+    except Exception as e:
+        return None, f"Error: {str(e)}", "", ""
+# Create Gradio interface
+with gr.Blocks(title="DMOSpeech 2 - Zero-Shot TTS", theme=gr.themes.Soft()) as demo:
+    gr.Markdown(f"""
+    # 🎙️ DMOSpeech 2: Zero-Shot Text-to-Speech
+    Generate natural speech in any voice with just a short reference audio!
+    **Model Status:** {status_message} | **Device:** {device.upper()}
+    """)
+    with gr.Row():
+        with gr.Column(scale=1):
+            # Reference audio input
+            prompt_audio = gr.Audio(
+                label="📎 Reference Audio",
+                type="filepath",
+                sources=["upload", "microphone"]
+            )
+            prompt_text = gr.Textbox(
+                label="📝 Reference Text (optional - will auto-transcribe if empty)",
+                placeholder="The text spoken in the reference audio...",
+                lines=2
+            )
+            target_text = gr.Textbox(
+                label="✍️ Text to Generate",
+                placeholder="Enter the text you want to synthesize...",
+                lines=4
+            )
+            # Generation mode
+            mode = gr.Radio(
+                choices=[
+                    "Student Only (4 steps)",
+                    "Teacher-Guided (8 steps)",
+                    "High Diversity (16 steps)",
+                    "Custom"
+                ],
+                value="Teacher-Guided (8 steps)",
+                label="🚀 Generation Mode",
+                info="Choose speed vs quality/diversity tradeoff"
+            )
+            # Advanced settings (collapsible)
+            with gr.Accordion("⚙️ Advanced Settings", open=False):
+                with gr.Row():
+                    custom_teacher_steps = gr.Slider(
+                        minimum=0,
+                        maximum=32,
+                        value=16,
+                        step=1,
+                        label="Teacher Steps",
+                        info="More steps = higher quality"
+                    )
+                    custom_teacher_stopping_time = gr.Slider(
+                        minimum=0.0,
+                        maximum=1.0,
+                        value=0.07,
+                        step=0.01,
+                        label="Teacher Stopping Time",
+                        info="When to switch to student"
+                    )
+                    custom_student_start_step = gr.Slider(
+                        minimum=0,
+                        maximum=4,
+                        value=1,
+                        step=1,
+                        label="Student Start Step",
+                        info="Which student step to start from"
+                    )
+                temperature = gr.Slider(
+                    minimum=0.0,
+                    maximum=2.0,
+                    value=0.0,
+                    step=0.1,
+                    label="Duration Temperature",
+                    info="0 = deterministic, >0 = more variation in speech rhythm"
+                )
+                verbose = gr.Checkbox(
+                    value=False,
+                    label="Verbose Output",
+                    info="Show detailed generation steps"
+                )
+            generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
+        with gr.Column(scale=1):
+            # Output
+            output_audio = gr.Audio(
+                label="🔊 Generated Speech",
+                type="filepath",
+                autoplay=True
+            )
+            status = gr.Textbox(
+                label="Status",
+                interactive=False
+            )
+            metrics = gr.Textbox(
+                label="Performance Metrics",
+                interactive=False
+            )
+            info = gr.Textbox(
+                label="Generation Info",
+                interactive=False
+            )
+            # Tips
+            gr.Markdown("""
+            ### 💡 Quick Tips:
+            - **Student Only**: Fastest (4 steps), good quality
+            - **Teacher-Guided**: Best balance (8 steps), recommended
+            - **High Diversity**: More natural prosody (16 steps)
+            - **Temperature**: Add randomness to speech rhythm
+            ### 📊 Expected RTF (Real-Time Factor):
+            - Student Only: ~0.05x (20x faster than real-time)
+            - Teacher-Guided: ~0.10x (10x faster)
+            - High Diversity: ~0.20x (5x faster)
+            """)
+    # Examples section
+    gr.Markdown("### 🎯 Examples")
+    examples = [
+        [
+            None,  # Will be replaced with actual audio path
+            "Some call me nature, others call me mother nature.",
+            "I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring.",
+            "Teacher-Guided (8 steps)",
+            16, 0.07, 1, 0.0, False
+        ],
+        [
+            None,  # Will be replaced with actual audio path
+            "对，这就是我，万人敬仰的太乙真人。",
+            '突然，身边一阵笑声。我看着他们，意气风发地挺直了胸膛，甩了甩那稍显肉感的双臂，轻笑道："我身上的肉，是为了掩饰我爆棚的魅力，否则，岂不吓坏了你们呢？"',
+            "Teacher-Guided (8 steps)",
+            16, 0.07, 1, 0.0, False
+        ],
+        [
+            None,
+            "对，这就是我，万人敬仰的太乙真人。",
+            '突然，身边一阵笑声。我看着他们，意气风发地挺直了胸膛，甩了甩那稍显肉感的双臂，轻笑道："我身上��肉，是为了掩饰我爆棚的魅力，否则，岂不吓坏了你们呢？"',
+            "High Diversity (16 steps)",
+            24, 0.3, 2, 0.8, False
+        ]
+    ]
+    # Note about example audio files
+    gr.Markdown("""
+    *Note: Example audio files should be uploaded to the Space. The examples above show the text configurations used in the original notebook.*
+    """)
+    # Event handler
+    generate_btn.click(
+        generate_speech,
+        inputs=[
+            prompt_audio,
+            prompt_text,
+            target_text,
+            mode,
+            custom_teacher_steps,
+            custom_teacher_stopping_time,
+            custom_student_start_step,
+            temperature,
+            verbose
+        ],
+        outputs=[output_audio, status, metrics, info]
+    )
+    # Update visibility of custom settings based on mode
+    def update_custom_visibility(mode):
+        return gr.update(visible=(mode == "Custom"))
+    mode.change(
+        lambda x: [gr.update(interactive=(x == "Custom"))] * 3,
+        inputs=[mode],
+        outputs=[custom_teacher_steps, custom_teacher_stopping_time, custom_student_start_step]
+    )
+# Launch the app. The UI is started even when model loading failed; in that
+# case a warning with the failure message is printed first.
+if __name__ == "__main__":
+    if not model_loaded:
+        print(f"Warning: Model failed to load - {status_message}")
+    demo.launch()