moe-tts / app.py
skytnt's picture
gradio 5.0.1
eb91658
raw
history blame
14.7 kB
import argparse
import json
import os
import re
import gradio as gr
import librosa
import numpy as np
import torch
from torch import no_grad, LongTensor
import commons
import utils
from mel_processing import spectrogram_torch
from models import SynthesizerTrn
from text import text_to_sequence, _clean_text
limitation = os.getenv("SYSTEM") == "spaces" # limit text and audio length in huggingface spaces
def get_text(text, hps, is_symbol):
text_norm = text_to_sequence(text, hps.symbols, [] if is_symbol else hps.data.text_cleaners)
if hps.data.add_blank:
text_norm = commons.intersperse(text_norm, 0)
text_norm = LongTensor(text_norm)
return text_norm
def create_tts_fn(model, hps, speaker_ids):
def tts_fn(text, speaker, speed, is_symbol):
if limitation:
text_len = len(re.sub("\[([A-Z]{2})\]", "", text))
max_len = 150
if is_symbol:
max_len *= 3
if text_len > max_len:
return "Error: Text is too long", None
speaker_id = speaker_ids[speaker]
stn_tst = get_text(text, hps, is_symbol)
with no_grad():
x_tst = stn_tst.unsqueeze(0).to(device)
x_tst_lengths = LongTensor([stn_tst.size(0)]).to(device)
sid = LongTensor([speaker_id]).to(device)
audio = model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8,
length_scale=1.0 / speed)[0][0, 0].data.cpu().float().numpy()
del stn_tst, x_tst, x_tst_lengths, sid
return "Success", (hps.data.sampling_rate, audio)
return tts_fn
def create_vc_fn(model, hps, speaker_ids):
def vc_fn(original_speaker, target_speaker, input_audio):
if input_audio is None:
return "You need to upload an audio", None
sampling_rate, audio = input_audio
original_speaker_id = speaker_ids[original_speaker]
target_speaker_id = speaker_ids[target_speaker]
audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
if len(audio.shape) > 1:
audio = librosa.to_mono(audio.transpose(1, 0))
if sampling_rate != hps.data.sampling_rate:
audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=hps.data.sampling_rate)
with no_grad():
y = torch.FloatTensor(audio)
y = y.unsqueeze(0)
spec = spectrogram_torch(y, hps.data.filter_length,
hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
center=False).to(device)
spec_lengths = LongTensor([spec.size(-1)]).to(device)
sid_src = LongTensor([original_speaker_id]).to(device)
sid_tgt = LongTensor([target_speaker_id]).to(device)
audio = model.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[0][
0, 0].data.cpu().float().numpy()
del y, spec, spec_lengths, sid_src, sid_tgt
return "Success", (hps.data.sampling_rate, audio)
return vc_fn
def create_soft_vc_fn(model, hps, speaker_ids):
def soft_vc_fn(target_speaker, input_audio):
if input_audio is None:
return "You need to upload an audio", None
sampling_rate, audio = input_audio
target_speaker_id = speaker_ids[target_speaker]
audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
if len(audio.shape) > 1:
audio = librosa.to_mono(audio.transpose(1, 0))
if sampling_rate != 16000:
audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
with torch.inference_mode():
units = hubert.units(torch.FloatTensor(audio).unsqueeze(0).unsqueeze(0).to(device))
with no_grad():
unit_lengths = LongTensor([units.size(1)]).to(device)
sid = LongTensor([target_speaker_id]).to(device)
audio = model.infer(units, unit_lengths, sid=sid, noise_scale=.667,
noise_scale_w=0.8)[0][0, 0].data.cpu().float().numpy()
del units, unit_lengths, sid
return "Success", (hps.data.sampling_rate, audio)
return soft_vc_fn
def create_to_symbol_fn(hps):
def to_symbol_fn(is_symbol_input, input_text, temp_text):
return (_clean_text(input_text, hps.data.text_cleaners), input_text) if is_symbol_input \
else (temp_text, temp_text)
return to_symbol_fn
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--device', type=str, default='cpu')
parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
args = parser.parse_args()
device = torch.device(args.device)
models_tts = []
models_vc = []
models_soft_vc = []
with open("saved_model/info.json", "r", encoding="utf-8") as f:
models_info = json.load(f)
for i, info in models_info.items():
name = info["title"]
author = info["author"]
lang = info["lang"]
example = info["example"]
config_path = f"saved_model/{i}/config.json"
model_path = f"saved_model/{i}/model.pth"
cover = info["cover"]
cover_path = f"saved_model/{i}/{cover}" if cover else None
hps = utils.get_hparams_from_file(config_path)
model = SynthesizerTrn(
len(hps.symbols),
hps.data.filter_length // 2 + 1,
hps.train.segment_size // hps.data.hop_length,
n_speakers=hps.data.n_speakers,
**hps.model)
utils.load_checkpoint(model_path, model, None)
model.eval().to(device)
if isinstance(hps.speakers, utils.HParams):
speakers, speaker_ids = zip(*hps.speakers.items())
else:
speaker_ids = [sid for sid, name in enumerate(hps.speakers) if name != "None"]
speakers = [name for sid, name in enumerate(hps.speakers) if name != "None"]
t = info["type"]
if t == "vits":
models_tts.append((name, author, cover_path, speakers, lang, example,
hps.symbols, create_tts_fn(model, hps, speaker_ids),
create_to_symbol_fn(hps)))
models_vc.append((name, author, cover_path, speakers, create_vc_fn(model, hps, speaker_ids)))
elif t == "soft-vits-vc":
models_soft_vc.append((name, author, cover_path, speakers, create_soft_vc_fn(model, hps, speaker_ids)))
hubert = torch.hub.load("bshall/hubert:main", "hubert_soft", trust_repo=True).to(device)
app = gr.Blocks()
with app:
gr.Markdown("# Moe TTS And Voice Conversion Using VITS Model\n\n"
"![visitor badge](https://api.visitorbadge.io/api/visitors?path=skytnt.moe-tts&countColor=%23263759&style=flat&labelStyle=lower)\n\n"
"[Open In Colab]"
"(https://colab.research.google.com/drive/14Pb8lpmwZL-JI5Ub6jpG4sz2-8KS0kbS?usp=sharing)"
" without queue and length limitation.\n\n"
"Feel free to [open discussion](https://huggingface.co/spaces/skytnt/moe-tts/discussions/new) "
"if you want to add your model to this app.")
with gr.Tabs():
with gr.TabItem("TTS"):
with gr.Tabs():
for i, (name, author, cover_path, speakers, lang, example, symbols, tts_fn,
to_symbol_fn) in enumerate(models_tts):
with gr.TabItem(f"model{i}"):
with gr.Column():
cover_markdown = f"![cover](gradio_api/file={cover_path})\n\n" if cover_path else ""
gr.Markdown(f"## {name}\n\n"
f"{cover_markdown}"
f"model author: {author}\n\n"
f"language: {lang}")
tts_input1 = gr.TextArea(label="Text (150 chars limitation)", value=example,
elem_id=f"tts-input{i}")
tts_input2 = gr.Dropdown(label="Speaker", choices=speakers,
type="index", value=speakers[0])
tts_input3 = gr.Slider(label="Speed", value=1, minimum=0.5, maximum=2, step=0.1)
with gr.Accordion(label="Advanced Options", open=False):
temp_text_var = gr.State()
symbol_input = gr.Checkbox(value=False, label="Symbol input")
symbol_list = gr.Dataset(label="Symbol list", components=[tts_input1],
samples=[[x] for x in symbols],
elem_id=f"symbol-list{i}")
symbol_list_state = gr.State(value=symbols)
tts_submit = gr.Button("Generate", variant="primary")
tts_output1 = gr.Textbox(label="Output Message")
tts_output2 = gr.Audio(label="Output Audio", elem_id=f"tts-audio{i}")
tts_submit.click(tts_fn, [tts_input1, tts_input2, tts_input3, symbol_input],
[tts_output1, tts_output2], concurrency_limit=3)
symbol_input.change(to_symbol_fn,
[symbol_input, tts_input1, temp_text_var],
[tts_input1, temp_text_var])
symbol_list.click(None, [symbol_list, symbol_list_state], [],
js=f"""
(i,symbols) => {{
let root = document.querySelector("body > gradio-app");
if (root.shadowRoot != null)
root = root.shadowRoot;
let text_input = root.querySelector("#tts-input{i}").querySelector("textarea");
let startPos = text_input.selectionStart;
let endPos = text_input.selectionEnd;
let oldTxt = text_input.value;
console.log(i, symbols, symbols[i])
let result = oldTxt.substring(0, startPos) + symbols[i] + oldTxt.substring(endPos);
text_input.value = result;
let x = window.scrollX, y = window.scrollY;
text_input.focus();
text_input.selectionStart = startPos + symbols[i].length;
text_input.selectionEnd = startPos + symbols[i].length;
text_input.blur();
window.scrollTo(x, y);
return [];
}}""")
with gr.TabItem("Voice Conversion"):
with gr.Tabs():
for i, (name, author, cover_path, speakers, vc_fn) in enumerate(models_vc):
with gr.TabItem(f"model{i}"):
cover_markdown = f"![cover](gradio_api/file={cover_path})\n\n" if cover_path else ""
gr.Markdown(f"## {name}\n\n"
f"{cover_markdown}"
f"model author: {author}")
vc_input1 = gr.Dropdown(label="Original Speaker", choices=speakers, type="index",
value=speakers[0])
vc_input2 = gr.Dropdown(label="Target Speaker", choices=speakers, type="index",
value=speakers[min(len(speakers) - 1, 1)])
vc_input3 = gr.Audio(label="Input Audio",
max_length=30 if limitation else None)
vc_submit = gr.Button("Convert", variant="primary")
vc_output1 = gr.Textbox(label="Output Message")
vc_output2 = gr.Audio(label="Output Audio", elem_id=f"vc-audio{i}")
vc_submit.click(vc_fn, [vc_input1, vc_input2, vc_input3], [vc_output1, vc_output2],
concurrency_limit=3)
with gr.TabItem("Soft Voice Conversion"):
with gr.Tabs():
for i, (name, author, cover_path, speakers, soft_vc_fn) in enumerate(models_soft_vc):
with gr.TabItem(f"model{i}"):
cover_markdown = f"![cover](gradio_api/file={cover_path})\n\n" if cover_path else ""
gr.Markdown(f"## {name}\n\n"
f"{cover_markdown}"
f"model author: {author}")
vc_input1 = gr.Dropdown(label="Target Speaker", choices=speakers, type="index",
value=speakers[0])
vc_input2 = gr.Audio(label="Input Audio",
max_length=30 if limitation else None)
vc_submit = gr.Button("Convert", variant="primary")
vc_output1 = gr.Textbox(label="Output Message")
vc_output2 = gr.Audio(label="Output Audio", elem_id=f"svc-audio{i}")
vc_submit.click(soft_vc_fn, [vc_input1, vc_input2],
[vc_output1, vc_output2], concurrency_limit=3)
gr.Markdown(
"unofficial demo for \n\n"
"- [https://github.com/CjangCjengh/MoeGoe](https://github.com/CjangCjengh/MoeGoe)\n"
"- [https://github.com/Francis-Komizu/VITS](https://github.com/Francis-Komizu/VITS)\n"
"- [https://github.com/luoyily/MoeTTS](https://github.com/luoyily/MoeTTS)\n"
"- [https://github.com/Francis-Komizu/Sovits](https://github.com/Francis-Komizu/Sovits)"
)
app.launch(show_api=False, share=args.share, allowed_paths=["./saved_model"])