import spaces
import gradio as gr
import torch
import torchaudio
import tempfile
import json
import os
from typing import Optional, Tuple

from huggingface_hub import hf_hub_download

from generation_utils import load_model, process_batch

def load_examples_from_jsonl():
    """
    Load examples from examples/examples.jsonl and convert to ROLE_EXAMPLES format
    """
    examples = []
    jsonl_path = "examples/examples.jsonl"
    
    if not os.path.exists(jsonl_path):
        print(f"Warning: {jsonl_path} not found")
        return []

    with open(jsonl_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # skip blank lines so json.loads does not fail

            data = json.loads(line)
            
            # Extract required fields
            text = data.get('text', '')
            base_path = data.get('base_path', 'examples')
            
            # Check if this is a role-based example (has speaker1 and speaker2 audio)
            if 'prompt_audio_speaker1' in data and 'prompt_audio_speaker2' in data:
                # Role mode example
                audio_mode = "Role"
                prompt_audio_1 = os.path.join(base_path, data['prompt_audio_speaker1'])
                prompt_text_1 = data.get('prompt_text_speaker1', '')
                prompt_audio_2 = os.path.join(base_path, data['prompt_audio_speaker2'])
                prompt_text_2 = data.get('prompt_text_speaker2', '')
                use_normalize = True
                
                example = [text, audio_mode, prompt_audio_1, prompt_text_1, prompt_audio_2, prompt_text_2, use_normalize]
                examples.append(example)
    
    print(f"Loaded {len(examples)} examples from {jsonl_path}")
    return examples
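
# A minimal sketch of the examples/examples.jsonl schema that the loader above
# expects (field names are taken from the parsing code; the paths and text are
# hypothetical placeholders):
#
#   {"text": "[S1]Hello there.[S2]Hi, how are you?",
#    "base_path": "examples",
#    "prompt_audio_speaker1": "role1.wav",
#    "prompt_text_speaker1": "Reference line spoken by Role1.",
#    "prompt_audio_speaker2": "role2.wav",
#    "prompt_text_speaker2": "Reference line spoken by Role2."}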

# Load examples from JSONL file
ROLE_EXAMPLES = load_examples_from_jsonl()

# Language configuration
LANGUAGES = {
    "English": {
        "title": "MOSS-TTSD🪐 Dialogue Generation",
        "script_input": "### Script Input",
        "text_to_synthesize": "Text to Synthesize",
        "text_placeholder": "Text to be synthesized, format: [S1]Role1 text[S2]Role2 text",
        "use_normalize": "Use text normalization",
        "normalize_info": "Recommended to enable, improves handling of numbers, punctuation, etc.",
        "audio_input_mode": "### Audio Input Mode",
        "select_input_mode": "Select input mode",
        "mode_info": "Single Audio: Upload one audio with [S1][S2] text; Role Audio: Upload separate audio for Role1 and Role2",
        "drag_drop_audio": "Drag and drop audio here - or - click to upload",
        "prompt_text": "Prompt Text",
        "prompt_placeholder": "Format: [S1]Role1 text[S2]Role2 text",
        "role1_audio": "**Role1 Audio**",
        "role1_audio_file": "Role1 Audio File",
        "role1_text": "Role1 Text",
        "role1_placeholder": "Role1 text content",
        "role2_audio": "**Role2 Audio**",
        "role2_audio_file": "Role2 Audio File",
        "role2_text": "Role2 Text",
        "role2_placeholder": "Role2 text content",
        "generate_audio": "Generate Audio",
        "generated_audio": "Generated Audio",
        "status_info": "Status Information",
        "examples": "### Examples",
        "examples_desc": "Click on examples below to auto-fill the form",
        "role_headers": ["Text to Synthesize", "Input Mode", "Role1 Audio File", "Role1 Text", "Role2 Audio File", "Role2 Text", "Use Normalize"]
    },
    "中文": {
        "title": "MOSS-TTSD🪐 对话语音生成",
        "script_input": "### 文本输入",
        "text_to_synthesize": "要合成的文本",
        "text_placeholder": "要合成的文本,格式:[S1]角色1文本[S2]角色2文本",
        "use_normalize": "使用文本规范化",
        "normalize_info": "建议启用,改善数字、标点符号等的处理",
        "audio_input_mode": "### 音频输入模式",
        "select_input_mode": "选择输入模式",
        "mode_info": "单音频:上传一个包含[S1][S2]文本的音频;角色音频:分别为角色1和角色2上传音频",
        "drag_drop_audio": "拖拽音频文件到此处 - 或 - 点击上传",
        "prompt_text": "提示文本",
        "prompt_placeholder": "格式:[S1]角色1文本[S2]角色2文本",
        "role1_audio": "**角色1音频**",
        "role1_audio_file": "角色1音频文件",
        "role1_text": "角色1文本",
        "role1_placeholder": "角色1文本内容",
        "role2_audio": "**角色2音频**",
        "role2_audio_file": "角色2音频文件",
        "role2_text": "角色2文本",
        "role2_placeholder": "角色2文本内容",
        "generate_audio": "生成音频",
        "generated_audio": "生成的音频",
        "status_info": "状态信息",
        "examples": "### 示例",
        "examples_desc": "点击下方示例自动填充表单",
        "role_headers": ["要合成的文本", "输入模式", "角色1音频文件", "角色1文本", "角色2音频文件", "角色2文本", "使用规范化"]
    }
}

# Model configuration
SYSTEM_PROMPT = "You are a speech synthesizer that generates natural, realistic, and human-like conversational audio from dialogue text."
MODEL_PATH = "fnlp/MOSS-TTSD-v0"
SPT_CONFIG_PATH = "XY_Tokenizer/config/xy_tokenizer_config.yaml"
# SPT_CHECKPOINT_PATH = "XY_Tokenizer/weights/xy_tokenizer.ckpt"
MAX_CHANNELS = 8

SPT_CHECKPOINT_PATH = hf_hub_download(
    repo_id="fnlp/XY_Tokenizer_TTSD_V0",
    filename="xy_tokenizer.ckpt",
    cache_dir="XY_Tokenizer/weights"
)

print("Checkpoint downloaded to:", SPT_CHECKPOINT_PATH)

# Global variables for caching loaded models
tokenizer = None
model = None
spt = None
device = None

def initialize_model():
    """Initialize model (load only on first call)"""
    global tokenizer, model, spt, device
    
    if tokenizer is None:
        print("Initializing model...")
        device = "cuda"
        print(f"Using {device}")
        tokenizer, model, spt = load_model(MODEL_PATH, SPT_CONFIG_PATH, SPT_CHECKPOINT_PATH)
        spt = spt.to(device)
        model = model.to(device)
        # limit max new tokens to avoid timeouts
        model.generation_config.max_new_tokens = 4096
        print("Model initialization completed!")
    
    return tokenizer, model, spt, device

# Initialize model when starting the application
initialize_model()

@spaces.GPU(duration=120)
def process_single_audio_generation(
    text_input: str,
    audio_mode: str,
    prompt_text_single: str,
    prompt_audio_single: Optional[str] = None,
    prompt_text_1: str = "",
    prompt_audio_1: Optional[str] = None,
    prompt_text_2: str = "",
    prompt_audio_2: Optional[str] = None,
    use_normalize: bool = True
) -> Tuple[Optional[str], str]:
    """
    Process single audio generation request
    
    Args:
        text_input: Text to synthesize
        prompt_text_single: Prompt text for single audio
        prompt_audio_single: Single audio file path
        prompt_text_1: Role1 text
        prompt_audio_1: Role1 audio file path
        prompt_text_2: Role2 text
        prompt_audio_2: Role2 audio file path
        use_normalize: Whether to use text normalization
    
    Returns:
        Generated audio file path and status information
    """
    try:
        # Initialize model
        tokenizer, model, spt, device = initialize_model()
        
        # Build input item
        item = {
            "text": text_input,
        }
        
        # Handle different audio input modes (mutually exclusive)
        if audio_mode == "Single":
            # Use single audio mode
            item["prompt_audio"] = prompt_audio_single
            item["prompt_text"] = prompt_text_single
        elif audio_mode == "Role" and prompt_audio_1 and prompt_audio_2:
            # Use role audio mode (requires both audio files)
            item["prompt_audio_speaker1"] = prompt_audio_1
            item["prompt_text_speaker1"] = prompt_text_1 if prompt_text_1 else ""
            item["prompt_audio_speaker2"] = prompt_audio_2
            item["prompt_text_speaker2"] = prompt_text_2 if prompt_text_2 else ""
        elif audio_mode == "Role" and prompt_audio_1:
            # Only Role 1 audio provided, treat as single audio
            print("Only Role 1 audio provided, treating as single audio.")
            item["prompt_audio"] = prompt_audio_1
            item["prompt_text"] = prompt_text_1 if prompt_text_1 else ""
        elif audio_mode == "Role" and prompt_audio_2:
            # Only Role 2 audio provided, treat as single audio
            print("Only Role 2 audio provided, treating as single audio.")
            item["prompt_audio"] = prompt_audio_2
            item["prompt_text"] = prompt_text_2 if prompt_text_2 else ""
        else:
            return None, "Error: Please select a mode and provide corresponding audio files\n- Single Audio Mode: Provide one audio file and corresponding text\n- Role Mode: Provide audio files for Role1 and Role2"
        
        # Optionally set a fixed random seed here for reproducible results:
        # import accelerate
        # accelerate.utils.set_seed(42)
        
        # Process batch (single item)
        actual_texts_data, audio_results = process_batch(
            batch_items=[item],
            tokenizer=tokenizer,
            model=model,
            spt=spt,
            device=device,
            system_prompt=SYSTEM_PROMPT,
            start_idx=0,
            use_normalize=use_normalize
        )
        
        # Check results
        if not audio_results or audio_results[0] is None:
            return None, "Error: Audio generation failed"
        
        audio_result = audio_results[0]
        
        # Create a temporary output file (the handle is closed right away;
        # torchaudio writes to the path below)
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            output_path = tmp_file.name
        
        # Save audio
        torchaudio.save(output_path, audio_result["audio_data"], audio_result["sample_rate"])
        
        # Build status information (using English since this is server-side output)
        status_info = f"""
✅ Generation successful!
📊 Audio Information:
   - Sample Rate: {audio_result["sample_rate"]} Hz
   - Audio Length: {audio_result["audio_data"].shape[-1] / audio_result["sample_rate"]:.2f} seconds
   - Channels: {audio_result["audio_data"].shape[0]}

📝 Text Processing Information:
   - Original Text: {actual_texts_data[0]['original_text'][:100]}...
   - Final Text: {actual_texts_data[0]['final_text'][:100]}...
   - Use Normalize: {actual_texts_data[0]['use_normalize']}
        """
        
        return output_path, status_info
        
    except Exception as e:
        import traceback
        error_msg = f"Error: Audio generation failed: {str(e)}\n\nDetails:\n{traceback.format_exc()}"
        return None, error_msg
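
# A usage sketch (kept as a comment so it is not executed at import time):
# calling the generation function directly, outside the Gradio UI. The prompt
# audio paths and texts are hypothetical placeholders.
#
#   audio_path, status = process_single_audio_generation(
#       text_input="[S1]Hello there.[S2]Hi, how are you?",
#       audio_mode="Role",
#       prompt_text_single="",
#       prompt_audio_single=None,
#       prompt_text_1="Reference line spoken by Role1.",
#       prompt_audio_1="examples/role1.wav",
#       prompt_text_2="Reference line spoken by Role2.",
#       prompt_audio_2="examples/role2.wav",
#       use_normalize=True,
#   )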

# Create Gradio interface
def create_gradio_interface() -> gr.Blocks:
    with gr.Blocks(title="MOSS-TTSD🪐 Dialogue Generation", theme=gr.themes.Soft()) as demo:
        
        # Language selection at the top
        with gr.Row():
            language_selector = gr.Radio(
                choices=["English", "中文"],
                value="English",
                label="Language / 语言",
                info="Select interface language / 选择界面语言"
            )
        
        # Title and header (will be updated based on language)
        title_md = gr.Markdown("# MOSS-TTSD🪐 Dialogue Generation")
        github_md = gr.Markdown("### [Github](https://github.com/OpenMOSS/MOSS-TTSD)")
        
        with gr.Row():
            # Left input area
            with gr.Column(scale=1):
                script_input_md = gr.Markdown("### Script Input")
                
                text_input = gr.Textbox(
                    label="Text to Synthesize",
                    placeholder="Text to be synthesized, format: [S1]Role1 text[S2]Role2 text",
                    lines=6,
                )
                
                use_normalize_single = gr.Checkbox(
                    label="Use text normalization",
                    value=True,
                    info="Recommended to enable, improves handling of numbers, punctuation, etc."
                )
            
            # Right audio input area
            with gr.Column(scale=1):
                audio_input_mode_md = gr.Markdown("### Audio Input Mode")
                
                # Audio input mode selection
                audio_mode = gr.Radio(
                    choices=["Single", "Role"],
                    value="Single",
                    label="Select input mode",
                    info="Single Audio: Upload one audio with [S1][S2] text; Role Audio: Upload separate audio for Role1 and Role2"
                )
                
                # Single audio mode
                with gr.Group(visible=True) as single_mode_group:
                    prompt_audio_single = gr.File(
                        label="Drag and drop audio here - or - click to upload",
                        file_types=["audio"],
                        type="filepath"
                    )
                    prompt_text_single = gr.Textbox(
                        label="Prompt Text",
                        placeholder="Format: [S1]Role1 text[S2]Role2 text",
                        lines=3,
                    )
                
                # Role audio mode
                with gr.Group(visible=False) as role_mode_group:
                    with gr.Row():
                        with gr.Column():
                            role1_audio_md = gr.Markdown("**Role1 Audio**")
                            prompt_audio_1 = gr.File(
                                label="Role1 Audio File",
                                file_types=["audio"],
                                type="filepath"
                            )
                            prompt_text_1 = gr.Textbox(
                                label="Role1 Text",
                                placeholder="Role1 text content",
                                lines=2
                            )
                        
                        with gr.Column():
                            role2_audio_md = gr.Markdown("**Role2 Audio**")
                            prompt_audio_2 = gr.File(
                                label="Role2 Audio File",
                                file_types=["audio"],
                                type="filepath"
                            )
                            prompt_text_2 = gr.Textbox(
                                label="Role2 Text",
                                placeholder="Role2 text content",
                                lines=2
                            )
        
        # Generate button
        with gr.Row():
            generate_btn = gr.Button("Generate Audio", variant="primary", size="lg")
        
        # Output area
        with gr.Row():
            with gr.Column():
                output_audio = gr.Audio(label="Generated Audio", type="filepath")
                status_info = gr.Textbox(
                    label="Status Information",
                    lines=10,
                    interactive=False
                )
        
        # Examples area
        with gr.Row():
            with gr.Column():
                examples_md = gr.Markdown("### Examples")
                examples_desc_md = gr.Markdown("Click on examples below to auto-fill the form")

                role_examples = gr.Examples(
                    examples=ROLE_EXAMPLES,
                    inputs=[text_input, audio_mode, prompt_audio_1, prompt_text_1, prompt_audio_2, prompt_text_2, use_normalize_single],
                )
        
        # Event handlers
        
        # Language change event
        def update_language(lang):
            """Update interface language"""
            texts = LANGUAGES[lang]
            
            # Update demo title
            demo.title = texts["title"]
            
            return (
                gr.Markdown(f"# {texts['title']}"),  # title_md
                texts["script_input"],  # script_input_md
                gr.Textbox(
                    label=texts["text_to_synthesize"],
                    placeholder=texts["text_placeholder"],
                    lines=6,
                ),  # text_input
                gr.Checkbox(
                    label=texts["use_normalize"],
                    value=True,
                    info=texts["normalize_info"]
                ),  # use_normalize_single
                texts["audio_input_mode"],  # audio_input_mode_md
                gr.Radio(
                    choices=["Single", "Role"],
                    value="Single",
                    label=texts["select_input_mode"],
                    info=texts["mode_info"]
                ),  # audio_mode
                gr.File(
                    label=texts["drag_drop_audio"],
                    file_types=["audio"],
                    type="filepath"
                ),  # prompt_audio_single
                gr.Textbox(
                    label=texts["prompt_text"],
                    placeholder=texts["prompt_placeholder"],
                    lines=3,
                ),  # prompt_text_single
                texts["role1_audio"],  # role1_audio_md
                gr.File(
                    label=texts["role1_audio_file"],
                    file_types=["audio"],
                    type="filepath"
                ),  # prompt_audio_1
                gr.Textbox(
                    label=texts["role1_text"],
                    placeholder=texts["role1_placeholder"],
                    lines=2
                ),  # prompt_text_1
                texts["role2_audio"],  # role2_audio_md
                gr.File(
                    label=texts["role2_audio_file"],
                    file_types=["audio"],
                    type="filepath"
                ),  # prompt_audio_2
                gr.Textbox(
                    label=texts["role2_text"],
                    placeholder=texts["role2_placeholder"],
                    lines=2
                ),  # prompt_text_2
                gr.Button(texts["generate_audio"], variant="primary", size="lg"),  # generate_btn
                gr.Audio(label=texts["generated_audio"], type="filepath"),  # output_audio
                gr.Textbox(
                    label=texts["status_info"],
                    lines=10,
                    interactive=False
                ),  # status_info
                texts["examples"],  # examples_md
                texts["examples_desc"],  # examples_desc_md
                gr.Dataset(headers=texts["role_headers"])  # role_examples.dataset
            )
        
        language_selector.change(
            fn=update_language,
            inputs=[language_selector],
            outputs=[
                title_md, script_input_md, text_input, use_normalize_single,
                audio_input_mode_md, audio_mode, prompt_audio_single, prompt_text_single,
                role1_audio_md, prompt_audio_1, prompt_text_1,
                role2_audio_md, prompt_audio_2, prompt_text_2,
                generate_btn, output_audio, status_info,
                examples_md, examples_desc_md, role_examples.dataset,
            ]
        )
        
        # Audio mode toggle event
        def toggle_audio_mode(mode):
            if mode == "Single":
                return gr.Group(visible=True), gr.Group(visible=False)
            else:
                return gr.Group(visible=False), gr.Group(visible=True)
        
        audio_mode.change(
            fn=toggle_audio_mode,
            inputs=[audio_mode],
            outputs=[single_mode_group, role_mode_group]
        )
        
        # Audio generation event
        generate_btn.click(
            fn=process_single_audio_generation,
            inputs=[
                text_input,
                audio_mode,
                prompt_text_single,
                prompt_audio_single,
                prompt_text_1,
                prompt_audio_1,
                prompt_text_2,
                prompt_audio_2,
                use_normalize_single
            ],
            outputs=[output_audio, status_info],
            show_progress=True
        )
    
    return demo

# Main function
if __name__ == "__main__":
    demo = create_gradio_interface()
    
    # Launch interface
    demo.launch()