yl4579 committed on
Commit 2f84825 · verified · 1 Parent(s): 1b8d1f0

Update app.py

Files changed (1):
  1. app.py +105 -338
app.py CHANGED
@@ -1,350 +1,117 @@
- import gradio as gr
  import torch
- import torchaudio
- import numpy as np
- import tempfile
- import time
  from pathlib import Path
  from huggingface_hub import hf_hub_download
- import os
-
- # Import the inference module (assuming it's named 'infer.py' based on the notebook)
- from infer import DMOInference
-
- # Global model instance
- model = None
- device = "cuda" if torch.cuda.is_available() else "cpu"
-
- def download_models():
-     """Download models from HuggingFace Hub."""
-     try:
-         print("Downloading models from HuggingFace...")
-
-         # Download student model
-         student_path = hf_hub_download(
-             repo_id="yl4579/DMOSpeech2",
-             filename="model_85000.pt",
-             cache_dir="./models"
-         )
-
-         # Download duration predictor
-         duration_path = hf_hub_download(
-             repo_id="yl4579/DMOSpeech2",
-             filename="model_1500.pt",
-             cache_dir="./models"
-         )
-
-         print(f"Student model: {student_path}")
-         print(f"Duration model: {duration_path}")
-
-         return student_path, duration_path
-
-     except Exception as e:
-         print(f"Error downloading models: {e}")
-         return None, None
-
- def initialize_model():
-     """Initialize the model on startup."""
-     global model
-
-     try:
-         # Download models
-         student_path, duration_path = download_models()
-
-         if not student_path or not duration_path:
-             return False, "Failed to download models from HuggingFace"
-
-         # Initialize model
-         model = DMOInference(
-             student_checkpoint_path=student_path,
-             duration_predictor_path=duration_path,
-             device=device,
-             model_type="F5TTS_Base"
-         )
-
-         return True, f"Model loaded successfully on {device.upper()}"
-
-     except Exception as e:
-         return False, f"Error initializing model: {str(e)}"
-
- # Initialize model on startup
- model_loaded, status_message = initialize_model()
-
- def generate_speech(
-     prompt_audio,
-     prompt_text,
-     target_text,
-     mode,
-     # Advanced settings
-     custom_teacher_steps,
-     custom_teacher_stopping_time,
-     custom_student_start_step,
-     temperature,
-     verbose
- ):
-     """Generate speech with different configurations."""
-
-     if not model_loaded or model is None:
-         return None, "Model not loaded! Please refresh the page.", "", ""
-
-     if prompt_audio is None:
-         return None, "Please upload a reference audio!", "", ""
-
-     if not target_text:
-         return None, "Please enter text to generate!", "", ""
-
-     try:
-         start_time = time.time()
-
-         # Configure parameters based on mode
-         if mode == "Student Only (4 steps)":
-             teacher_steps = 0
-             student_start_step = 0
-             teacher_stopping_time = 1.0
-         elif mode == "Teacher-Guided (8 steps)":
-             # Default configuration from the notebook
-             teacher_steps = 16
-             teacher_stopping_time = 0.07
-             student_start_step = 1
-         elif mode == "High Diversity (16 steps)":
-             teacher_steps = 24
-             teacher_stopping_time = 0.3
-             student_start_step = 2
-         else:  # Custom
-             teacher_steps = custom_teacher_steps
-             teacher_stopping_time = custom_teacher_stopping_time
-             student_start_step = custom_student_start_step
-
-         # Generate speech
-         generated_audio = model.generate(
-             gen_text=target_text,
-             audio_path=prompt_audio,
-             prompt_text=prompt_text if prompt_text else None,
-             teacher_steps=teacher_steps,
-             teacher_stopping_time=teacher_stopping_time,
-             student_start_step=student_start_step,
-             temperature=temperature,
-             verbose=verbose
-         )
-
-         end_time = time.time()
-
-         # Calculate metrics
-         processing_time = end_time - start_time
-         audio_duration = generated_audio.shape[-1] / 24000
-         rtf = processing_time / audio_duration
-
-         # Save audio
-         with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
-             output_path = tmp_file.name
-
-         if isinstance(generated_audio, np.ndarray):
-             generated_audio = torch.from_numpy(generated_audio)
-
-         if generated_audio.dim() == 1:
-             generated_audio = generated_audio.unsqueeze(0)
-
-         torchaudio.save(output_path, generated_audio, 24000)
-
-         # Format metrics
-         metrics = f"RTF: {rtf:.2f}x ({1/rtf:.2f}x speed) | Processing: {processing_time:.2f}s for {audio_duration:.2f}s audio"
-
-         return output_path, "Success!", metrics, f"Mode: {mode}"
-
-     except Exception as e:
-         return None, f"Error: {str(e)}", "", ""
-
- # Create Gradio interface
- with gr.Blocks(title="DMOSpeech 2 - Zero-Shot TTS", theme=gr.themes.Soft()) as demo:
-     gr.Markdown(f"""
-     # 🎙️ DMOSpeech 2: Zero-Shot Text-to-Speech
-
-     Generate natural speech in any voice with just a short reference audio!
-
-     **Model Status:** {status_message} | **Device:** {device.upper()}
-     """)
-
-     with gr.Row():
-         with gr.Column(scale=1):
-             # Reference audio input
-             prompt_audio = gr.Audio(
-                 label="📎 Reference Audio",
-                 type="filepath",
-                 sources=["upload", "microphone"]
-             )
-
-             prompt_text = gr.Textbox(
-                 label="📝 Reference Text (optional - will auto-transcribe if empty)",
-                 placeholder="The text spoken in the reference audio...",
-                 lines=2
-             )
-
-             target_text = gr.Textbox(
-                 label="✍️ Text to Generate",
-                 placeholder="Enter the text you want to synthesize...",
-                 lines=4
-             )
-
-             # Generation mode
-             mode = gr.Radio(
-                 choices=[
-                     "Student Only (4 steps)",
-                     "Teacher-Guided (8 steps)",
-                     "High Diversity (16 steps)",
-                     "Custom"
-                 ],
-                 value="Teacher-Guided (8 steps)",
-                 label="🚀 Generation Mode",
-                 info="Choose speed vs quality/diversity tradeoff"
-             )
-
-             # Advanced settings (collapsible)
-             with gr.Accordion("⚙️ Advanced Settings", open=False):
-                 with gr.Row():
-                     custom_teacher_steps = gr.Slider(
-                         minimum=0,
-                         maximum=32,
-                         value=16,
-                         step=1,
-                         label="Teacher Steps",
-                         info="More steps = higher quality"
-                     )
-
-                     custom_teacher_stopping_time = gr.Slider(
-                         minimum=0.0,
-                         maximum=1.0,
-                         value=0.07,
-                         step=0.01,
-                         label="Teacher Stopping Time",
-                         info="When to switch to student"
-                     )
-
-                     custom_student_start_step = gr.Slider(
-                         minimum=0,
-                         maximum=4,
-                         value=1,
-                         step=1,
-                         label="Student Start Step",
-                         info="Which student step to start from"
-                     )
-
-                 temperature = gr.Slider(
-                     minimum=0.0,
-                     maximum=2.0,
-                     value=0.0,
-                     step=0.1,
-                     label="Duration Temperature",
-                     info="0 = deterministic, >0 = more variation in speech rhythm"
-                 )
-
-                 verbose = gr.Checkbox(
-                     value=False,
-                     label="Verbose Output",
-                     info="Show detailed generation steps"
-                 )
-
-             generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
-
-         with gr.Column(scale=1):
-             # Output
-             output_audio = gr.Audio(
-                 label="🔊 Generated Speech",
-                 type="filepath",
-                 autoplay=True
-             )
-
-             status = gr.Textbox(
-                 label="Status",
-                 interactive=False
-             )
-
-             metrics = gr.Textbox(
-                 label="Performance Metrics",
-                 interactive=False
-             )
-
-             info = gr.Textbox(
-                 label="Generation Info",
-                 interactive=False
-             )
-
-     # Tips
-     gr.Markdown("""
-     ### 💡 Quick Tips:
-
-     - **Student Only**: Fastest (4 steps), good quality
-     - **Teacher-Guided**: Best balance (8 steps), recommended
-     - **High Diversity**: More natural prosody (16 steps)
-     - **Temperature**: Add randomness to speech rhythm
-
-     ### 📊 Expected RTF (Real-Time Factor):
-     - Student Only: ~0.05x (20x faster than real-time)
-     - Teacher-Guided: ~0.10x (10x faster)
-     - High Diversity: ~0.20x (5x faster)
-     """)
-
-     # Examples section
-     gr.Markdown("### 🎯 Examples")
-
-     examples = [
-         [
-             None,  # Will be replaced with actual audio path
-             "Some call me nature, others call me mother nature.",
-             "I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring.",
-             "Teacher-Guided (8 steps)",
-             16, 0.07, 1, 0.0, False
-         ],
-         [
-             None,  # Will be replaced with actual audio path
-             "对,这就是我,万人敬仰的太乙真人。",
-             '突然,身边一阵笑声。我看着他们,意气风发地挺直了胸膛,甩了甩那稍显肉感的双臂,轻笑道:"我身上的肉,是为了掩饰我爆棚的魅力,否则,岂不吓坏了你们呢?"',
-             "Teacher-Guided (8 steps)",
-             16, 0.07, 1, 0.0, False
-         ],
-         [
-             None,
-             "对,这就是我,万人敬仰的太乙真人。",
-             '突然,身边一阵笑声。我看着他们,意气风发地挺直了胸膛,甩了甩那稍显肉感的双臂,轻笑道:"我身上的肉,是为了掩饰我爆棚的魅力,否则,岂不吓坏了你们呢?"',
-             "High Diversity (16 steps)",
-             24, 0.3, 2, 0.8, False
-         ]
-     ]
-
-     # Note about example audio files
-     gr.Markdown("""
-     *Note: Example audio files should be uploaded to the Space. The examples above show the text configurations used in the original notebook.*
-     """)
-
-     # Event handler
-     generate_btn.click(
-         generate_speech,
-         inputs=[
-             prompt_audio,
-             prompt_text,
-             target_text,
-             mode,
-             custom_teacher_steps,
-             custom_teacher_stopping_time,
-             custom_student_start_step,
-             temperature,
-             verbose
-         ],
-         outputs=[output_audio, status, metrics, info]
-     )
-
-     # Update visibility of custom settings based on mode
-     def update_custom_visibility(mode):
-         return gr.update(visible=(mode == "Custom"))
-
-     mode.change(
-         lambda x: [gr.update(interactive=(x == "Custom"))] * 3,
-         inputs=[mode],
-         outputs=[custom_teacher_steps, custom_teacher_stopping_time, custom_student_start_step]
-     )
-
- # Launch the app
- if __name__ == "__main__":
-     if not model_loaded:
-         print(f"Warning: Model failed to load - {status_message}")
-
-     demo.launch()
+ # Add this to your DMOInference class or create a wrapper
+
+ import os
  import torch
  from pathlib import Path
  from huggingface_hub import hf_hub_download
+ import re
+
+ def load_checkpoint_from_hf(checkpoint_path, device='cpu'):
+     """
+     Load a checkpoint from either a local path or HuggingFace URL.
+
+     Supports:
+     - Local paths: /path/to/model.pt
+     - HF URLs: hf://username/repo/model.pt
+     - HF hub format: username/repo/model.pt
+     """
+     if isinstance(checkpoint_path, str):
+         # Check if it's a HuggingFace URL
+         if checkpoint_path.startswith("hf://"):
+             # Parse HF URL: hf://username/repo/path/to/model.pt
+             match = re.match(r"hf://([^/]+/[^/]+)/(.+)", checkpoint_path)
+             if match:
+                 repo_id = match.group(1)
+                 filename = match.group(2)
+
+                 print(f"Loading from HuggingFace: {repo_id}/{filename}")
+
+                 # Download from HuggingFace
+                 local_path = hf_hub_download(
+                     repo_id=repo_id,
+                     filename=filename,
+                     cache_dir=os.environ.get("HF_HOME", "./models")
+                 )
+
+                 # Load the checkpoint
+                 return torch.load(local_path, map_location=device)
+
+         # Check if it's a HuggingFace repo format (username/repo/file.pt)
+         elif "/" in checkpoint_path and not os.path.exists(checkpoint_path):
+             parts = checkpoint_path.split("/")
+             if len(parts) >= 3:
+                 repo_id = "/".join(parts[:2])
+                 filename = "/".join(parts[2:])
+
+                 print(f"Loading from HuggingFace: {repo_id}/{filename}")
+
+                 local_path = hf_hub_download(
+                     repo_id=repo_id,
+                     filename=filename,
+                     cache_dir=os.environ.get("HF_HOME", "./models")
+                 )
+
+                 return torch.load(local_path, map_location=device)
+
+         # Local file path
+         elif os.path.exists(checkpoint_path):
+             print(f"Loading from local path: {checkpoint_path}")
+             return torch.load(checkpoint_path, map_location=device)
+
+     raise ValueError(f"Could not load checkpoint from: {checkpoint_path}")
+
+ # Modified DMOInference class init (partial)
+ class DMOInference:
+     def __init__(
+         self,
+         student_checkpoint_path="",
+         duration_predictor_path="",
+         device="cuda",
+         model_type="F5TTS_Base",
+         tokenizer="pinyin",
+         dataset_name="Emilia_ZH_EN",
+         cuda_device_id="0"
+     ):
+         # ... (previous initialization code) ...
+
+         # Initialize components
+         self._setup_tokenizer()
+         self._setup_models(student_checkpoint_path)  # Modified to handle HF URLs
+         self._setup_mel_spec()
+         self._setup_vocoder()
+         self._setup_duration_predictor(duration_predictor_path)  # Modified to handle HF URLs
+
+     def _setup_models(self, student_checkpoint_path):
+         """Initialize teacher and student models with HF support."""
+         # ... (model configuration code) ...
+
+         # Load student checkpoint with HF support
+         checkpoint = load_checkpoint_from_hf(student_checkpoint_path, device='cpu')
+         self.model.load_state_dict(checkpoint['model_state_dict'], strict=False)
+
+         # ... (rest of the setup) ...
+
+     def _setup_duration_predictor(self, checkpoint_path):
+         """Initialize duration predictor with HF support."""
+         # ... (model initialization code) ...
+
+         # Load checkpoint with HF support
+         checkpoint = load_checkpoint_from_hf(checkpoint_path, device='cpu')
+         self.SLP.load_state_dict(checkpoint['model_state_dict'])
+
+ # Wrapper class for easier use
+ class DMOInferenceHF(DMOInference):
+     """DMOInference with built-in HuggingFace support."""
+
+     def __init__(self, **kwargs):
+         # Override checkpoint loading to support HF URLs
+         if 'student_checkpoint_path' in kwargs:
+             self._original_student_path = kwargs['student_checkpoint_path']
+         if 'duration_predictor_path' in kwargs:
+             self._original_duration_path = kwargs['duration_predictor_path']
+
+         super().__init__(**kwargs)
+
+     def _load_checkpoint(self, checkpoint_path):
+         """Load checkpoint with HF URL support."""
+         return load_checkpoint_from_hf(checkpoint_path, self.device)
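
For reference, a minimal usage sketch of the loader and wrapper added above. It assumes the new code lives in infer.py (the module name used by the removed app.py) and reuses the repo and checkpoint names from the old download_models(); the exact import path is an assumption, not part of this commit:

# Usage sketch (hypothetical entry point; assumes the code above is importable from infer.py)
from infer import DMOInferenceHF, load_checkpoint_from_hf

# All three supported path styles resolve to the same checkpoint:
ckpt = load_checkpoint_from_hf("hf://yl4579/DMOSpeech2/model_85000.pt")  # hf:// URL
ckpt = load_checkpoint_from_hf("yl4579/DMOSpeech2/model_85000.pt")       # repo format
# ckpt = load_checkpoint_from_hf("./models/model_85000.pt")              # local path

# Instantiate directly from HF URLs instead of pre-downloading with hf_hub_download:
model = DMOInferenceHF(
    student_checkpoint_path="hf://yl4579/DMOSpeech2/model_85000.pt",
    duration_predictor_path="hf://yl4579/DMOSpeech2/model_1500.pt",
    device="cuda",
    model_type="F5TTS_Base",
)

With this in place, the Space no longer needs the download_models()/initialize_model() plumbing that this commit removes: checkpoint resolution happens inside the class itself.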