import spaces import gradio as gr import torch import torchaudio import tempfile import json import os from typing import Optional, Tuple from generation_utils import load_model, process_batch def load_examples_from_jsonl(): """ Load examples from examples/examples.jsonl and convert to ROLE_EXAMPLES format """ examples = [] jsonl_path = "examples/examples.jsonl" if not os.path.exists(jsonl_path): print(f"Warning: {jsonl_path} not found") return [] with open(jsonl_path, 'r', encoding='utf-8') as f: for line in f: line = line.strip() data = json.loads(line) # Extract required fields text = data.get('text', '') base_path = data.get('base_path', 'examples') # Check if this is a role-based example (has speaker1 and speaker2 audio) if 'prompt_audio_speaker1' in data and 'prompt_audio_speaker2' in data: # Role mode example audio_mode = "Role" prompt_audio_1 = os.path.join(base_path, data['prompt_audio_speaker1']) prompt_text_1 = data.get('prompt_text_speaker1', '') prompt_audio_2 = os.path.join(base_path, data['prompt_audio_speaker2']) prompt_text_2 = data.get('prompt_text_speaker2', '') use_normalize = True example = [text, audio_mode, prompt_audio_1, prompt_text_1, prompt_audio_2, prompt_text_2, use_normalize] examples.append(example) print(f"Loaded {len(examples)} examples from {jsonl_path}") return examples # Load examples from JSONL file ROLE_EXAMPLES = load_examples_from_jsonl() # Language configuration LANGUAGES = { "English": { "title": "MOSS-TTSD🪐 Dialogue Generation", "script_input": "### Script Input", "text_to_synthesize": "Text to Synthesize", "text_placeholder": "Text to be synthesized, format: [S1]Role1 text[S2]Role2 text", "use_normalize": "Use text normalization", "normalize_info": "Recommended to enable, improves handling of numbers, punctuation, etc.", "audio_input_mode": "### Audio Input Mode", "select_input_mode": "Select input mode", "mode_info": "Single Audio: Upload one audio with [S1][S2] text; Role Audio: Upload separate audio for Role1 and Role2", "drag_drop_audio": "Drag and drop audio here - or - click to upload", "prompt_text": "Prompt Text", "prompt_placeholder": "Format: [S1]Role1 text[S2]Role2 text", "role1_audio": "**Role1 Audio**", "role1_audio_file": "Role1 Audio File", "role1_text": "Role1 Text", "role1_placeholder": "Role1 text content", "role2_audio": "**Role2 Audio**", "role2_audio_file": "Role2 Audio File", "role2_text": "Role2 Text", "role2_placeholder": "Role2 text content", "generate_audio": "Generate Audio", "generated_audio": "Generated Audio", "status_info": "Status Information", "examples": "### Examples", "examples_desc": "Click on examples below to auto-fill the form", "role_headers": ["Text to Synthesize", "Input Mode", "Role1 Audio File", "Role1 Text", "Role2 Audio File", "Role2 Text", "Use Normalize"] }, "中文": { "title": "MOSS-TTSD🪐 对话语音生成", "script_input": "### 文本输入", "text_to_synthesize": "要合成的文本", "text_placeholder": "要合成的文本,格式:[S1]角色1文本[S2]角色2文本", "use_normalize": "使用文本规范化", "normalize_info": "建议启用,改善数字、标点符号等的处理", "audio_input_mode": "### 音频输入模式", "select_input_mode": "选择输入模式", "mode_info": "单音频:上传一个包含[S1][S2]文本的音频;角色音频:分别为角色1和角色2上传音频", "drag_drop_audio": "拖拽音频文件到此处 - 或 - 点击上传", "prompt_text": "提示文本", "prompt_placeholder": "格式:[S1]角色1文本[S2]角色2文本", "role1_audio": "**角色1音频**", "role1_audio_file": "角色1音频文件", "role1_text": "角色1文本", "role1_placeholder": "角色1文本内容", "role2_audio": "**角色2音频**", "role2_audio_file": "角色2音频文件", "role2_text": "角色2文本", "role2_placeholder": "角色2文本内容", "generate_audio": "生成音频", "generated_audio": "生成的音频", "status_info": "状态信息", "examples": "### 示例", "examples_desc": "点击下方示例自动填充表单", "role_headers": ["要合成的文本", "输入模式", "角色1音频文件", "角色1文本", "角色2音频文件", "角色2文本", "使用规范化"] } } # Model configuration SYSTEM_PROMPT = "You are a speech synthesizer that generates natural, realistic, and human-like conversational audio from dialogue text." MODEL_PATH = "fnlp/MOSS-TTSD-v0" SPT_CONFIG_PATH = "XY_Tokenizer/config/xy_tokenizer_config.yaml" # SPT_CHECKPOINT_PATH = "XY_Tokenizer/weights/xy_tokenizer.ckpt" MAX_CHANNELS = 8 from huggingface_hub import hf_hub_download SPT_CHECKPOINT_PATH = hf_hub_download( repo_id="fnlp/XY_Tokenizer_TTSD_V0", filename="xy_tokenizer.ckpt", cache_dir="XY_Tokenizer/weights" ) print("Checkpoint downloaded to:", SPT_CHECKPOINT_PATH) # Global variables for caching loaded models tokenizer = None model = None spt = None device = None def initialize_model(): """Initialize model (load only on first call)""" global tokenizer, model, spt, device if tokenizer is None: print("Initializing model...") device = "cuda" print(f"Using {device}") tokenizer, model, spt = load_model(MODEL_PATH, SPT_CONFIG_PATH, SPT_CHECKPOINT_PATH) spt = spt.to(device) model = model.to(device) # limit max new tokens to avoid timeouts model.generation_config.max_new_tokens = 4096 print("Model initialization completed!") return tokenizer, model, spt, device # Initialize model when starting the application initialize_model() @spaces.GPU(duration=120) def process_single_audio_generation( text_input: str, audio_mode: str, prompt_text_single: str, prompt_audio_single: Optional[str] = None, prompt_text_1: str = "", prompt_audio_1: Optional[str] = None, prompt_text_2: str = "", prompt_audio_2: Optional[str] = None, use_normalize: bool = True ) -> Tuple[Optional[str], str]: """ Process single audio generation request Args: text_input: Text to synthesize prompt_text_single: Prompt text for single audio prompt_audio_single: Single audio file path prompt_text_1: Role1 text prompt_audio_1: Role1 audio file path prompt_text_2: Role2 text prompt_audio_2: Role2 audio file path use_normalize: Whether to use text normalization Returns: Generated audio file path and status information """ try: # Initialize model tokenizer, model, spt, device = initialize_model() # Build input item item = { "text": text_input, } # Handle different audio input modes (mutually exclusive) if audio_mode == "Single": # Use single audio mode item["prompt_audio"] = prompt_audio_single item["prompt_text"] = prompt_text_single elif audio_mode == "Role" and prompt_audio_1 and prompt_audio_2: # Use role audio mode (requires both audio files) item["prompt_audio_speaker1"] = prompt_audio_1 item["prompt_text_speaker1"] = prompt_text_1 if prompt_text_1 else "" item["prompt_audio_speaker2"] = prompt_audio_2 item["prompt_text_speaker2"] = prompt_text_2 if prompt_text_2 else "" elif audio_mode == "Role" and prompt_audio_1: # Only Role 1 audio provided, treat as single audio print("Only Role 1 audio provided, treating as single audio.") item["prompt_audio"] = prompt_audio_1 item["prompt_text"] = prompt_text_1 if prompt_text_1 else "" elif audio_mode == "Role" and prompt_audio_2: # Only Role 2 audio provided, treat as single audio print("Only Role 2 audio provided, treating as single audio.") item["prompt_audio"] = prompt_audio_2 item["prompt_text"] = prompt_text_2 if prompt_text_2 else "" else: return None, "Error: Please select a mode and provide corresponding audio files\n- Single Audio Mode: Provide one audio file and corresponding text\n- Role Mode: Provide audio files for Role1 and Role2" # Set random seed to ensure reproducible results # import accelerate # accelerate.utils.set_seed(42) # Process batch (single item) actual_texts_data, audio_results = process_batch( batch_items=[item], tokenizer=tokenizer, model=model, spt=spt, device=device, system_prompt=SYSTEM_PROMPT, start_idx=0, use_normalize=use_normalize ) # Check results if not audio_results or audio_results[0] is None: return None, "Error: Audio generation failed" audio_result = audio_results[0] # Create temporary output file output_path = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name # Save audio torchaudio.save(output_path, audio_result["audio_data"], audio_result["sample_rate"]) # Build status information (using English since this is server-side output) status_info = f""" ✅ Generation successful! 📊 Audio Information: - Sample Rate: {audio_result["sample_rate"]} Hz - Audio Length: {audio_result["audio_data"].shape[-1] / audio_result["sample_rate"]:.2f} seconds - Channels: {audio_result["audio_data"].shape[0]} 📝 Text Processing Information: - Original Text: {actual_texts_data[0]['original_text'][:100]}... - Final Text: {actual_texts_data[0]['final_text'][:100]}... - Use Normalize: {actual_texts_data[0]['use_normalize']} """ return output_path, status_info except Exception as e: import traceback error_msg = f"Error: Audio generation failed: {str(e)}\n\nDetails:\n{traceback.format_exc()}" return None, error_msg # Create Gradio interface def create_gradio_interface() -> gr.Blocks: with gr.Blocks(title="MOSS-TTSD🪐 Dialogue Generation", theme=gr.themes.Soft()) as demo: # Language selection at the top with gr.Row(): language_selector = gr.Radio( choices=["English", "中文"], value="English", label="Language / 语言", info="Select interface language / 选择界面语言" ) # Title and header (will be updated based on language) title_md = gr.Markdown("# MOSS-TTSD🪐 Dialogue Generation") github_md = gr.Markdown("### [Github](https://github.com/OpenMOSS/MOSS-TTSD)") with gr.Row(): # Left input area with gr.Column(scale=1): script_input_md = gr.Markdown("### Script Input") text_input = gr.Textbox( label="Text to Synthesize", placeholder="Text to be synthesized, format: [S1]Role1 text[S2]Role2 text", lines=6, ) use_normalize_single = gr.Checkbox( label="Use text normalization", value=True, info="Recommended to enable, improves handling of numbers, punctuation, etc." ) # Right audio input area with gr.Column(scale=1): audio_input_mode_md = gr.Markdown("### Audio Input Mode") # Audio input mode selection audio_mode = gr.Radio( choices=["Single", "Role"], value="Single", label="Select input mode", info="Single Audio: Upload one audio with [S1][S2] text; Role Audio: Upload separate audio for Role1 and Role2" ) # Single audio mode with gr.Group(visible=True) as single_mode_group: prompt_audio_single = gr.File( label="Drag and drop audio here - or - click to upload", file_types=["audio"], type="filepath" ) prompt_text_single = gr.Textbox( label="Prompt Text", placeholder="Format: [S1]Role1 text[S2]Role2 text", lines=3, ) # Role audio mode with gr.Group(visible=False) as role_mode_group: with gr.Row(): with gr.Column(): role1_audio_md = gr.Markdown("**Role1 Audio**") prompt_audio_1 = gr.File( label="Role1 Audio File", file_types=["audio"], type="filepath" ) prompt_text_1 = gr.Textbox( label="Role1 Text", placeholder="Role1 text content", lines=2 ) with gr.Column(): role2_audio_md = gr.Markdown("**Role2 Audio**") prompt_audio_2 = gr.File( label="Role2 Audio File", file_types=["audio"], type="filepath" ) prompt_text_2 = gr.Textbox( label="Role2 Text", placeholder="Role2 text content", lines=2 ) # Generate button with gr.Row(): generate_btn = gr.Button("Generate Audio", variant="primary", size="lg") # Output area with gr.Row(): with gr.Column(): output_audio = gr.Audio(label="Generated Audio", type="filepath") status_info = gr.Textbox( label="Status Information", lines=10, interactive=False ) # Examples area with gr.Row(): with gr.Column(): examples_md = gr.Markdown("### Examples") examples_desc_md = gr.Markdown("Click on examples below to auto-fill the form") role_examples = gr.Examples( examples=ROLE_EXAMPLES, inputs=[text_input, audio_mode, prompt_audio_1, prompt_text_1, prompt_audio_2, prompt_text_2, use_normalize_single], ) # Event handlers # Language change event def update_language(lang): """Update interface language""" texts = LANGUAGES[lang] # Update demo title demo.title = texts["title"] return ( gr.Markdown(f"# {texts['title']}"), # title_md texts["script_input"], # script_input_md gr.Textbox( label=texts["text_to_synthesize"], placeholder=texts["text_placeholder"], lines=6, ), # text_input gr.Checkbox( label=texts["use_normalize"], value=True, info=texts["normalize_info"] ), # use_normalize_single texts["audio_input_mode"], # audio_input_mode_md gr.Radio( choices=["Single", "Role"], value="Single", label=texts["select_input_mode"], info=texts["mode_info"] ), # audio_mode gr.File( label=texts["drag_drop_audio"], file_types=["audio"], type="filepath" ), # prompt_audio_single gr.Textbox( label=texts["prompt_text"], placeholder=texts["prompt_placeholder"], lines=3, ), # prompt_text_single texts["role1_audio"], # role1_audio_md gr.File( label=texts["role1_audio_file"], file_types=["audio"], type="filepath" ), # prompt_audio_1 gr.Textbox( label=texts["role1_text"], placeholder=texts["role1_placeholder"], lines=2 ), # prompt_text_1 texts["role2_audio"], # role2_audio_md gr.File( label=texts["role2_audio_file"], file_types=["audio"], type="filepath" ), # prompt_audio_2 gr.Textbox( label=texts["role2_text"], placeholder=texts["role2_placeholder"], lines=2 ), # prompt_text_2 gr.Button(texts["generate_audio"], variant="primary", size="lg"), # generate_btn gr.Audio(label=texts["generated_audio"], type="filepath"), # output_audio gr.Textbox( label=texts["status_info"], lines=10, interactive=False ), # status_info texts["examples"], # examples_md texts["examples_desc"], # examples_desc_md gr.Dataset(headers=texts["role_headers"]) ) language_selector.change( fn=update_language, inputs=[language_selector], outputs=[ title_md, script_input_md, text_input, use_normalize_single, audio_input_mode_md, audio_mode, prompt_audio_single, prompt_text_single, role1_audio_md, prompt_audio_1, prompt_text_1, role2_audio_md, prompt_audio_2, prompt_text_2, generate_btn, output_audio, status_info, examples_md, examples_desc_md, role_examples.dataset, ] ) # Audio mode toggle event def toggle_audio_mode(mode): if mode == "Single": return gr.Group(visible=True), gr.Group(visible=False) else: return gr.Group(visible=False), gr.Group(visible=True) audio_mode.change( fn=toggle_audio_mode, inputs=[audio_mode], outputs=[single_mode_group, role_mode_group] ) # Audio generation event generate_btn.click( fn=process_single_audio_generation, inputs=[ text_input, audio_mode, prompt_text_single, prompt_audio_single, prompt_text_1, prompt_audio_1, prompt_text_2, prompt_audio_2, use_normalize_single ], outputs=[output_audio, status_info], show_progress=True ) return demo # Main function if __name__ == "__main__": demo = create_gradio_interface() # Launch interface demo.launch()