import spaces  # keep this import first: on ZeroGPU Spaces, `spaces` should load before torch

import json
import os
import tempfile
from typing import Optional, Tuple

import gradio as gr
import torch
import torchaudio
from huggingface_hub import hf_hub_download

from generation_utils import load_model, process_batch

def load_examples_from_jsonl():
    """
    Load examples from examples/examples.jsonl and convert them to the
    ROLE_EXAMPLES format expected by gr.Examples.
    """
    examples = []
    jsonl_path = "examples/examples.jsonl"

    if not os.path.exists(jsonl_path):
        print(f"Warning: {jsonl_path} not found")
        return []

    with open(jsonl_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Skip blank lines so json.loads() does not raise on them.
                continue
            data = json.loads(line)

            text = data.get('text', '')
            base_path = data.get('base_path', 'examples')

            # Only two-speaker ("Role") examples are surfaced in the UI.
            if 'prompt_audio_speaker1' in data and 'prompt_audio_speaker2' in data:
                audio_mode = "Role"
                prompt_audio_1 = os.path.join(base_path, data['prompt_audio_speaker1'])
                prompt_text_1 = data.get('prompt_text_speaker1', '')
                prompt_audio_2 = os.path.join(base_path, data['prompt_audio_speaker2'])
                prompt_text_2 = data.get('prompt_text_speaker2', '')
                use_normalize = True

                example = [text, audio_mode, prompt_audio_1, prompt_text_1,
                           prompt_audio_2, prompt_text_2, use_normalize]
                examples.append(example)

    print(f"Loaded {len(examples)} examples from {jsonl_path}")
    return examples
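

# One line of examples/examples.jsonl is expected to look roughly like this
# (illustrative values, not shipped with this file):
# {"text": "[S1]...[S2]...", "base_path": "examples",
#  "prompt_audio_speaker1": "s1.wav", "prompt_text_speaker1": "...",
#  "prompt_audio_speaker2": "s2.wav", "prompt_text_speaker2": "..."}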
ROLE_EXAMPLES = load_examples_from_jsonl()
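
# UI strings for the two interface languages. Both dicts must expose the
# same keys, since update_language() below indexes them uniformly.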
LANGUAGES = {
    "English": {
        "title": "MOSS-TTSD🪐 Dialogue Generation",
        "script_input": "### Script Input",
        "text_to_synthesize": "Text to Synthesize",
        "text_placeholder": "Text to be synthesized, format: [S1]Role1 text[S2]Role2 text",
        "use_normalize": "Use text normalization",
        "normalize_info": "Recommended to enable, improves handling of numbers, punctuation, etc.",
        "audio_input_mode": "### Audio Input Mode",
        "select_input_mode": "Select input mode",
        "mode_info": "Single Audio: Upload one audio with [S1][S2] text; Role Audio: Upload separate audio for Role1 and Role2",
        "drag_drop_audio": "Drag and drop audio here - or - click to upload",
        "prompt_text": "Prompt Text",
        "prompt_placeholder": "Format: [S1]Role1 text[S2]Role2 text",
        "role1_audio": "**Role1 Audio**",
        "role1_audio_file": "Role1 Audio File",
        "role1_text": "Role1 Text",
        "role1_placeholder": "Role1 text content",
        "role2_audio": "**Role2 Audio**",
        "role2_audio_file": "Role2 Audio File",
        "role2_text": "Role2 Text",
        "role2_placeholder": "Role2 text content",
        "generate_audio": "Generate Audio",
        "generated_audio": "Generated Audio",
        "status_info": "Status Information",
        "examples": "### Examples",
        "examples_desc": "Click on examples below to auto-fill the form",
        "role_headers": ["Text to Synthesize", "Input Mode", "Role1 Audio File", "Role1 Text", "Role2 Audio File", "Role2 Text", "Use Normalize"]
    },
    "中文": {
        "title": "MOSS-TTSD🪐 对话语音生成",
        "script_input": "### 文本输入",
        "text_to_synthesize": "要合成的文本",
        "text_placeholder": "要合成的文本,格式:[S1]角色1文本[S2]角色2文本",
        "use_normalize": "使用文本规范化",
        "normalize_info": "建议启用,改善数字、标点符号等的处理",
        "audio_input_mode": "### 音频输入模式",
        "select_input_mode": "选择输入模式",
        "mode_info": "单音频:上传一个包含[S1][S2]文本的音频;角色音频:分别为角色1和角色2上传音频",
        "drag_drop_audio": "拖拽音频文件到此处 - 或 - 点击上传",
        "prompt_text": "提示文本",
        "prompt_placeholder": "格式:[S1]角色1文本[S2]角色2文本",
        "role1_audio": "**角色1音频**",
        "role1_audio_file": "角色1音频文件",
        "role1_text": "角色1文本",
        "role1_placeholder": "角色1文本内容",
        "role2_audio": "**角色2音频**",
        "role2_audio_file": "角色2音频文件",
        "role2_text": "角色2文本",
        "role2_placeholder": "角色2文本内容",
        "generate_audio": "生成音频",
        "generated_audio": "生成的音频",
        "status_info": "状态信息",
        "examples": "### 示例",
        "examples_desc": "点击下方示例自动填充表单",
        "role_headers": ["要合成的文本", "输入模式", "角色1音频文件", "角色1文本", "角色2音频文件", "角色2文本", "使用规范化"]
    }
}


SYSTEM_PROMPT = "You are a speech synthesizer that generates natural, realistic, and human-like conversational audio from dialogue text."
MODEL_PATH = "fnlp/MOSS-TTSD-v0"
SPT_CONFIG_PATH = "XY_Tokenizer/config/xy_tokenizer_config.yaml"

MAX_CHANNELS = 8

SPT_CHECKPOINT_PATH = hf_hub_download(
    repo_id="fnlp/XY_Tokenizer_TTSD_V0",
    filename="xy_tokenizer.ckpt",
    cache_dir="XY_Tokenizer/weights"
)

print("Checkpoint downloaded to:", SPT_CHECKPOINT_PATH)

# Lazily-initialized global model state shared across requests.
tokenizer = None
model = None
spt = None
device = None


def initialize_model():
    """Initialize the model (loaded only on the first call)."""
    global tokenizer, model, spt, device

    if tokenizer is None:
        print("Initializing model...")
        # Fall back to CPU so the app can still start on machines without CUDA.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using {device}")
        tokenizer, model, spt = load_model(MODEL_PATH, SPT_CONFIG_PATH, SPT_CHECKPOINT_PATH)
        spt = spt.to(device)
        model = model.to(device)

        model.generation_config.max_new_tokens = 4096
        print("Model initialization completed!")

    return tokenizer, model, spt, device


# Load the model at import time so the first request does not pay the cost.
initialize_model()
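

# @spaces.GPU asks Hugging Face ZeroGPU for a GPU worker for up to `duration`
# seconds per call; outside Spaces the decorator is a no-op.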
@spaces.GPU(duration=120)
def process_single_audio_generation(
    text_input: str,
    audio_mode: str,
    prompt_text_single: str,
    prompt_audio_single: Optional[str] = None,
    prompt_text_1: str = "",
    prompt_audio_1: Optional[str] = None,
    prompt_text_2: str = "",
    prompt_audio_2: Optional[str] = None,
    use_normalize: bool = True
) -> Tuple[Optional[str], str]:
    """
    Process a single audio generation request.

    Args:
        text_input: Text to synthesize
        audio_mode: "Single" (one prompt audio) or "Role" (one audio per speaker)
        prompt_text_single: Prompt text for single-audio mode
        prompt_audio_single: Single prompt audio file path
        prompt_text_1: Role1 text
        prompt_audio_1: Role1 audio file path
        prompt_text_2: Role2 text
        prompt_audio_2: Role2 audio file path
        use_normalize: Whether to apply text normalization

    Returns:
        Generated audio file path and status information
    """
    try:
        tokenizer, model, spt, device = initialize_model()

        item = {
            "text": text_input,
        }

        # Assemble the batch item according to the selected input mode.
        if audio_mode == "Single" and prompt_audio_single:
            item["prompt_audio"] = prompt_audio_single
            item["prompt_text"] = prompt_text_single
        elif audio_mode == "Role" and prompt_audio_1 and prompt_audio_2:
            item["prompt_audio_speaker1"] = prompt_audio_1
            item["prompt_text_speaker1"] = prompt_text_1 or ""
            item["prompt_audio_speaker2"] = prompt_audio_2
            item["prompt_text_speaker2"] = prompt_text_2 or ""
        elif audio_mode == "Role" and prompt_audio_1:
            # Fall back to single-audio behavior when only one role is given.
            print("Only Role 1 audio provided, treating as single audio.")
            item["prompt_audio"] = prompt_audio_1
            item["prompt_text"] = prompt_text_1 or ""
        elif audio_mode == "Role" and prompt_audio_2:
            print("Only Role 2 audio provided, treating as single audio.")
            item["prompt_audio"] = prompt_audio_2
            item["prompt_text"] = prompt_text_2 or ""
        else:
            return None, (
                "Error: Please select a mode and provide the corresponding audio files\n"
                "- Single Audio Mode: Provide one audio file and its corresponding text\n"
                "- Role Mode: Provide audio files for Role1 and Role2"
            )

        actual_texts_data, audio_results = process_batch(
            batch_items=[item],
            tokenizer=tokenizer,
            model=model,
            spt=spt,
            device=device,
            system_prompt=SYSTEM_PROMPT,
            start_idx=0,
            use_normalize=use_normalize
        )

        if not audio_results or audio_results[0] is None:
            return None, "Error: Audio generation failed"

        audio_result = audio_results[0]

        # Write the result to a temporary wav file; close the handle before
        # torchaudio reopens the path.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            output_path = tmp.name
        torchaudio.save(output_path, audio_result["audio_data"], audio_result["sample_rate"])

        status_info = f"""
✅ Generation successful!
📊 Audio Information:
- Sample Rate: {audio_result["sample_rate"]} Hz
- Audio Length: {audio_result["audio_data"].shape[-1] / audio_result["sample_rate"]:.2f} seconds
- Channels: {audio_result["audio_data"].shape[0]}

📝 Text Processing Information:
- Original Text: {actual_texts_data[0]['original_text'][:100]}...
- Final Text: {actual_texts_data[0]['final_text'][:100]}...
- Use Normalize: {actual_texts_data[0]['use_normalize']}
"""

        return output_path, status_info

    except Exception as e:
        import traceback
        error_msg = f"Error: Audio generation failed: {str(e)}\n\nDetails:\n{traceback.format_exc()}"
        return None, error_msg
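

# Illustrative call (hypothetical paths; any local wav works as a prompt):
#   wav_path, status = process_single_audio_generation(
#       "[S1]Hello there.[S2]Hi, nice to meet you.",
#       audio_mode="Single",
#       prompt_text_single="[S1]Prompt line one.[S2]Prompt line two.",
#       prompt_audio_single="examples/prompt.wav",
#   )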


def create_gradio_interface() -> gr.Blocks:
    with gr.Blocks(title="MOSS-TTSD🪐 Dialogue Generation", theme=gr.themes.Soft()) as demo:

        with gr.Row():
            language_selector = gr.Radio(
                choices=["English", "中文"],
                value="English",
                label="Language / 语言",
                info="Select interface language / 选择界面语言"
            )

        title_md = gr.Markdown("# MOSS-TTSD🪐 Dialogue Generation")
        github_md = gr.Markdown("### [Github](https://github.com/OpenMOSS/MOSS-TTSD)")

        with gr.Row():
            with gr.Column(scale=1):
                script_input_md = gr.Markdown("### Script Input")

                text_input = gr.Textbox(
                    label="Text to Synthesize",
                    placeholder="Text to be synthesized, format: [S1]Role1 text[S2]Role2 text",
                    lines=6,
                )

                use_normalize_single = gr.Checkbox(
                    label="Use text normalization",
                    value=True,
                    info="Recommended to enable, improves handling of numbers, punctuation, etc."
                )

            with gr.Column(scale=1):
                audio_input_mode_md = gr.Markdown("### Audio Input Mode")

                audio_mode = gr.Radio(
                    choices=["Single", "Role"],
                    value="Single",
                    label="Select input mode",
                    info="Single Audio: Upload one audio with [S1][S2] text; Role Audio: Upload separate audio for Role1 and Role2"
                )

                with gr.Group(visible=True) as single_mode_group:
                    prompt_audio_single = gr.File(
                        label="Drag and drop audio here - or - click to upload",
                        file_types=["audio"],
                        type="filepath"
                    )
                    prompt_text_single = gr.Textbox(
                        label="Prompt Text",
                        placeholder="Format: [S1]Role1 text[S2]Role2 text",
                        lines=3,
                    )

                with gr.Group(visible=False) as role_mode_group:
                    with gr.Row():
                        with gr.Column():
                            role1_audio_md = gr.Markdown("**Role1 Audio**")
                            prompt_audio_1 = gr.File(
                                label="Role1 Audio File",
                                file_types=["audio"],
                                type="filepath"
                            )
                            prompt_text_1 = gr.Textbox(
                                label="Role1 Text",
                                placeholder="Role1 text content",
                                lines=2
                            )

                        with gr.Column():
                            role2_audio_md = gr.Markdown("**Role2 Audio**")
                            prompt_audio_2 = gr.File(
                                label="Role2 Audio File",
                                file_types=["audio"],
                                type="filepath"
                            )
                            prompt_text_2 = gr.Textbox(
                                label="Role2 Text",
                                placeholder="Role2 text content",
                                lines=2
                            )

        with gr.Row():
            generate_btn = gr.Button("Generate Audio", variant="primary", size="lg")

        with gr.Row():
            with gr.Column():
                output_audio = gr.Audio(label="Generated Audio", type="filepath")
                status_info = gr.Textbox(
                    label="Status Information",
                    lines=10,
                    interactive=False
                )

        with gr.Row():
            with gr.Column():
                examples_md = gr.Markdown("### Examples")
                examples_desc_md = gr.Markdown("Click on examples below to auto-fill the form")

                role_examples = gr.Examples(
                    examples=ROLE_EXAMPLES,
                    inputs=[text_input, audio_mode, prompt_audio_1, prompt_text_1, prompt_audio_2, prompt_text_2, use_normalize_single],
                )
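
        # Clicking an example row fills the mapped inputs; the table headers
        # themselves are re-labeled through role_examples.dataset when the
        # interface language changes.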

        def update_language(lang):
            """Update interface language."""
            texts = LANGUAGES[lang]

            demo.title = texts["title"]

            return (
                gr.Markdown(f"# {texts['title']}"),
                texts["script_input"],
                gr.Textbox(
                    label=texts["text_to_synthesize"],
                    placeholder=texts["text_placeholder"],
                    lines=6,
                ),
                gr.Checkbox(
                    label=texts["use_normalize"],
                    value=True,
                    info=texts["normalize_info"]
                ),
                texts["audio_input_mode"],
                gr.Radio(
                    choices=["Single", "Role"],
                    value="Single",
                    label=texts["select_input_mode"],
                    info=texts["mode_info"]
                ),
                gr.File(
                    label=texts["drag_drop_audio"],
                    file_types=["audio"],
                    type="filepath"
                ),
                gr.Textbox(
                    label=texts["prompt_text"],
                    placeholder=texts["prompt_placeholder"],
                    lines=3,
                ),
                texts["role1_audio"],
                gr.File(
                    label=texts["role1_audio_file"],
                    file_types=["audio"],
                    type="filepath"
                ),
                gr.Textbox(
                    label=texts["role1_text"],
                    placeholder=texts["role1_placeholder"],
                    lines=2
                ),
                texts["role2_audio"],
                gr.File(
                    label=texts["role2_audio_file"],
                    file_types=["audio"],
                    type="filepath"
                ),
                gr.Textbox(
                    label=texts["role2_text"],
                    placeholder=texts["role2_placeholder"],
                    lines=2
                ),
                gr.Button(texts["generate_audio"], variant="primary", size="lg"),
                gr.Audio(label=texts["generated_audio"], type="filepath"),
                gr.Textbox(
                    label=texts["status_info"],
                    lines=10,
                    interactive=False
                ),
                texts["examples"],
                texts["examples_desc"],
                gr.Dataset(headers=texts["role_headers"])
            )

        language_selector.change(
            fn=update_language,
            inputs=[language_selector],
            outputs=[
                title_md, script_input_md, text_input, use_normalize_single,
                audio_input_mode_md, audio_mode, prompt_audio_single, prompt_text_single,
                role1_audio_md, prompt_audio_1, prompt_text_1,
                role2_audio_md, prompt_audio_2, prompt_text_2,
                generate_btn, output_audio, status_info,
                examples_md, examples_desc_md, role_examples.dataset,
            ]
        )
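        # Note: this outputs list must stay in the same order as the tuple
        # returned by update_language(), element for element.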

        def toggle_audio_mode(mode):
            if mode == "Single":
                return gr.Group(visible=True), gr.Group(visible=False)
            else:
                return gr.Group(visible=False), gr.Group(visible=True)

        audio_mode.change(
            fn=toggle_audio_mode,
            inputs=[audio_mode],
            outputs=[single_mode_group, role_mode_group]
        )

        generate_btn.click(
            fn=process_single_audio_generation,
            inputs=[
                text_input,
                audio_mode,
                prompt_text_single,
                prompt_audio_single,
                prompt_text_1,
                prompt_audio_1,
                prompt_text_2,
                prompt_audio_2,
                use_normalize_single
            ],
            outputs=[output_audio, status_info],
            show_progress=True
        )

    return demo


if __name__ == "__main__":
    demo = create_gradio_interface()
    demo.launch()