Spaces:

TaiYouWeb
/

tts-xtts2-multi

Runtime error

File size: 4,228 Bytes

5ca847f
 
 
db3663c
5ca847f
 
aa93b1b
5ca847f
 
db3663c
5ca847f
 
 
 
 
aa93b1b
 
 
 
 
 
 
 
 
5ca847f
 
aa93b1b
5ca847f
 
 
db3663c
 
 
 
 
 
 
51c71fc
db3663c
 
 
 
 
5ca847f
 
 
 
 
51c71fc
 
5ca847f
 
 
 
 
 
 
db3663c
5ca847f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
db3663c
5ca847f
db3663c
 
5ca847f
 
 
db3663c
 
 
 
 
 
783ad44
db3663c
5a12fa3
db3663c
5a12fa3
 
db3663c

import io
import os
import tempfile
from typing import List

import TTS.api
import TTS.utils.manage as manage
import torch
from pydub import AudioSegment
import gradio as gr  # Gradio库

import config

# Run on the GPU when PyTorch reports CUDA as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Stand-in for ModelManager.ask_tos that accepts the license terms automatically.
def ask_tos_patch(self, output_path):
    """Accept the terms of service without asking.

    Args:
        self: The object the method is bound to; not used.
        output_path: Not used.

    Returns:
        Always ``True``.
    """
    notice = "Automatically accepting the terms of service."
    print(notice)
    return True

# Replace the original ask_tos method with the auto-accepting function defined
# above, so the license terms are accepted automatically.
manage.ModelManager.ask_tos = ask_tos_patch
tts = TTS.api.TTS()

# Download every model named in config.models and keep one loaded TTS instance
# per key, moved to the selected device.
# The loop variables are deliberately not called `id`, which would shadow the
# builtin for the rest of the module.
models = {}
for model_id, model_name in config.models.items():
    tts.download_model_by_name(model_name)
    models[model_id] = TTS.api.TTS(model_name).to(device)


def synthesize_tts(
    text: str = 'Hello, World!',
    speaker_wavs: List[gr.File] = None,
    speaker_idx: str = 'Ana Florence',
    language: str = 'ja',
    temperature: float = 0.65,
    length_penalty: float = 1.0,
    repetition_penalty: float = 2.0,
    top_k: int = 50,
    top_p: float = 0.8,
    speed: float = 1.0,
    enable_text_splitting: bool = True,
):
    """Synthesize speech for ``text`` with the model stored under ``models['multi']``.

    When reference clips are uploaded, each one is converted to WAV with pydub,
    written to a temporary file, and the list of those files is passed as
    ``speaker_wav``. Otherwise the named speaker ``speaker_idx`` is used.

    Args:
        text: Text to synthesize.
        speaker_wavs: Optional uploaded reference audio files. Each item may be
            an object exposing the file path as ``.name`` or a plain path
            string. ``None`` or an empty list selects ``speaker_idx`` instead.
        speaker_idx: Speaker name passed as ``speaker`` when no clips are given.
        language: Language code passed to the model.
            NOTE(review): the default here is 'ja' while the Gradio textbox
            defaults to 'en' -- confirm which one is intended.
        temperature: Forwarded unchanged to ``tts_to_file``.
        length_penalty: Forwarded unchanged to ``tts_to_file``.
        repetition_penalty: Forwarded unchanged to ``tts_to_file``.
        top_k: Forwarded unchanged to ``tts_to_file``.
        top_p: Forwarded unchanged to ``tts_to_file``.
        speed: Forwarded unchanged to ``tts_to_file``.
        enable_text_splitting: Forwarded unchanged to ``tts_to_file``.

    Returns:
        The bytes that ``tts_to_file`` wrote into the in-memory output buffer
        (presumably WAV data -- confirm against the TTS version in use).

    Raises:
        gr.Error: If an uploaded file cannot be decoded or converted to WAV.
    """
    temp_files = []
    try:
        for speaker_wav in speaker_wavs or []:
            # Accept both file wrappers (path in .name) and plain path strings.
            source_path = getattr(speaker_wav, "name", speaker_wav)
            with open(source_path, "rb") as f:
                speaker_wav_bytes = f.read()

            # Convert the uploaded audio to WAV using pydub.
            try:
                audio = AudioSegment.from_file(io.BytesIO(speaker_wav_bytes))
                wav_buffer = io.BytesIO()
                audio.export(wav_buffer, format="wav")
            except Exception as e:
                # Raise instead of returning the message: the return value feeds
                # an audio output, so a string would not be shown as an error.
                raise gr.Error(f"Error processing audio file: {e}") from e

            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav_file:
                # Record the path before writing so the finally block removes
                # the file even if the write fails.
                temp_files.append(temp_wav_file.name)
                temp_wav_file.write(wav_buffer.getvalue())

        # The two modes differ only in how the voice is selected.
        if temp_files:
            voice_kwargs = {"speaker_wav": temp_files}
        else:
            voice_kwargs = {"speaker": speaker_idx}

        output_buffer = io.BytesIO()
        models['multi'].tts_to_file(
            text=text,
            language=language,
            file_path=output_buffer,
            temperature=temperature,
            length_penalty=length_penalty,
            repetition_penalty=repetition_penalty,
            top_k=top_k,
            top_p=top_p,
            speed=speed,
            enable_text_splitting=enable_text_splitting,
            **voice_kwargs,
        )
        return output_buffer.getvalue()

    finally:
        # Always remove the temporary reference clips, on success or failure.
        for temp_file in temp_files:
            try:
                os.remove(temp_file)
            except FileNotFoundError:
                pass


# Build the Gradio interface.
# Input widgets are listed in the same order as synthesize_tts's parameters.
inputs = [
    gr.Textbox(label="Text to Synthesize", value="Hello, World!"),
    gr.File(label="Speaker WAV files (optional)", file_types=["audio"], file_count="multiple"),
    gr.Textbox(label="Speaker Index", value="Ana Florence"),
    gr.Textbox(label="Language", value="en"),
    gr.Slider(minimum=0, maximum=1, value=0.65, step=0.01, label="Temperature"),
    gr.Slider(minimum=0.5, maximum=2, value=1.0, step=0.1, label="Length Penalty"),
    gr.Slider(minimum=1.0, maximum=10.0, value=2.0, step=0.1, label="Repetition Penalty"),
    gr.Slider(minimum=1, maximum=100, value=50, step=1, label="Top-K"),
    gr.Slider(minimum=0, maximum=1, value=0.8, step=0.01, label="Top-P"),
    gr.Slider(minimum=0.5, maximum=2, value=1.0, step=0.01, label="Speed"),
    gr.Checkbox(label="Enable Text Splitting", value=True),
]

# Single audio widget that receives whatever synthesize_tts returns.
outputs = gr.Audio(label="Generated Speech")

# Wire the function to the widgets and start the app.
gr.Interface(
    title="Text-to-Speech Synthesis with Gradio",
    fn=synthesize_tts,
    inputs=inputs,
    outputs=outputs,
).launch()