import io
import os

import torch
import torchaudio
import gradio as gr
from pydub import AudioSegment
from scipy.io.wavfile import write
from TTS.api import TTS
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from TTS.utils.generic_utils import get_user_data_dir

# Accept the Coqui terms of service non-interactively so the model can download.
os.environ["COQUI_TOS_AGREED"] = "1"

# Instantiating the high-level API once downloads the XTTS v1 checkpoint into
# the local TTS cache; the low-level Xtts model is then loaded from that cache.
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1")

model_path = os.path.join(
    get_user_data_dir("tts"), "tts_models--multilingual--multi-dataset--xtts_v1"
)
config = XttsConfig()
config.load_json(os.path.join(model_path, "config.json"))

model = Xtts.init_from_config(config)
model.load_checkpoint(
    config,
    checkpoint_path=os.path.join(model_path, "model.pth"),
    vocab_path=os.path.join(model_path, "vocab.json"),
    eval=True,
    use_deepspeed=True,  # requires deepspeed to be installed; set to False otherwise
)
model.cuda()


def stream_audio(synthesis_text):
    # Compute conditioning latents from the reference clip ("female.wav" must
    # exist next to this script). XTTS v1 also returns diffusion conditioning,
    # which streaming inference does not need.
    gpt_cond_latent, _, speaker_embedding = model.get_conditioning_latents(
        audio_path="female.wav"
    )

    chunks = model.inference_stream(
        synthesis_text,
        "en",
        gpt_cond_latent,
        speaker_embedding,
        stream_chunk_size=10,
        overlap_wav_len=512,
    )

    # Write each generated chunk to its own WAV file and yield the path so the
    # streaming Audio component can start playback before synthesis finishes.
    for i, chunk in enumerate(chunks):
        print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
        out_file = f"{i}.wav"
        write(out_file, 24000, chunk.detach().cpu().numpy().squeeze())
        # Round-trip through pydub (ffmpeg) to produce a PCM WAV the browser can play.
        audio = AudioSegment.from_file(out_file)
        audio.export(out_file, format="wav")
        yield out_file


demo = gr.Interface(
    fn=stream_audio,
    inputs=gr.Textbox(),
    outputs=gr.Audio(autoplay=True, streaming=True),
)

if __name__ == "__main__":
    demo.queue().launch(debug=True)