Spaces:
Running
Running
File size: 2,997 Bytes
79fa297 3f4634d 79fa297 3f4634d 79fa297 6557227 79fa297 6557227 79fa297 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 |
import spaces
import gradio as gr
from cached_path import cached_path
import tempfile
from f5_tts.model import DiT
from f5_tts.infer.utils_infer import (
preprocess_ref_audio_text,
load_vocoder,
load_model,
infer_process,
save_spectrogram,
)
# Load the vocoder and the DiT model once, at import time, so that every call
# to the inference function reuses the same module-level objects.
vocoder = load_vocoder()

# Architecture keyword arguments passed to load_model together with the DiT class.
dit_arch_kwargs = {
    "dim": 1024,
    "depth": 22,
    "heads": 16,
    "ff_mult": 2,
    "text_dim": 512,
    "conv_layers": 4,
}

# Both resources are resolved through cached_path and handed over as plain strings.
checkpoint_location = str(cached_path("hf://whatvn/vietnamese-tts/model.tensors"))
vocab_location = str(cached_path("hf://whatvn/vietnamese-tts/vocab.txt"))

model = load_model(
    DiT,
    dit_arch_kwargs,
    ckpt_path=checkpoint_location,
    vocab_file=vocab_location,
)
@spaces.GPU
def infer(ref_audio_orig: str, gen_text: str, speed: float = 1.0):
    """Synthesize speech for ``gen_text`` using a reference audio recording.

    Args:
        ref_audio_orig: File path of the reference audio clip.
        gen_text: Text to synthesize; must contain non-whitespace characters.
        speed: Speed value forwarded unchanged to ``infer_process``.

    Returns:
        A tuple ``((sample_rate, waveform), spectrogram_path)`` where
        ``spectrogram_path`` is the path of a ``.png`` file written by
        ``save_spectrogram``.

    Raises:
        gr.Error: If the reference audio or the text is missing, or if any
            step of preprocessing, inference, or spectrogram saving fails.
    """
    if ref_audio_orig is None:
        raise gr.Error("Reference audio is required.")
    if not gen_text or not gen_text.strip():
        raise gr.Error("Text to generate is required.")

    try:
        # An empty reference transcript is passed here.
        # NOTE(review): presumably preprocess_ref_audio_text derives the
        # transcript itself in that case — confirm against f5_tts.
        ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, "")
        final_wave, final_sample_rate, combined_spectrogram = infer_process(
            ref_audio,
            ref_text,
            gen_text,
            model,
            vocoder,
            cross_fade_duration=0.15,
            nfe_step=32,
            speed=speed,
        )

        # Only reserve a unique file name here. delete=False keeps the file on
        # disk after the handle closes so the path can be returned to the UI.
        with tempfile.NamedTemporaryFile(
            suffix=".png", delete=False
        ) as tmp_spectrogram:
            spectrogram_path = tmp_spectrogram.name

        # Write after the handle is closed: re-opening a still-open named
        # temporary file by name is not supported on every platform.
        save_spectrogram(combined_spectrogram, spectrogram_path)

        return (final_sample_rate, final_wave), spectrogram_path
    except Exception as e:
        # Surface the failure in the UI while keeping the original traceback
        # attached as the explicit cause.
        raise gr.Error(f"An error occurred during inference: {e}") from e
# Input widgets, listed in the positional order of infer()'s parameters.
reference_audio_input = gr.components.Audio(type="filepath", label="Reference Audio")
text_input = gr.components.Textbox(label="Text to Generate", lines=3)
speed_input = gr.components.Slider(
    label="Speed",
    minimum=0.3,
    maximum=2.0,
    value=1.0,
    step=0.1,
    info="Adjust the speed of the audio.",
)

# Output widgets, matching the two values returned by infer().
synthesized_audio_output = gr.components.Audio(type="numpy", label="Synthesized Audio")
spectrogram_output = gr.components.Image(type="filepath", label="Spectrogram")

# Each example row is (reference audio path, text to generate, speed).
demo_examples = [
    [
        "examples/pc-01.wav",
        "để hiểu sâu sắc một sự việc, một vấn đề, từ đó khai thác được tốt hơn quá trình hình thành nên vấn đề",
        0.8,
    ],
    [
        "examples/pc-02.wav",
        "có nghĩa là cảm xúc là vốn có, là tức thời, là bản năng, đối với một sự việc con người có những phản ứng hay cảm xúc khác nhau",
        1.0,
    ],
]

app_description = "Vietnamese TTS model trained with public data (around 200 hours Vietnamese voice) using [F5-TTS](https://github.com/SWivid/F5-TTS) model"

# Single-function Gradio interface wrapping infer(); the clear button is
# disabled and flagging is turned off.
iface = gr.Interface(
    fn=infer,
    title="Vietnamese TTS",
    description=app_description,
    inputs=[reference_audio_input, text_input, speed_input],
    outputs=[synthesized_audio_output, spectrogram_output],
    examples=demo_examples,
    submit_btn="Synthesize",
    clear_btn=None,
    flagging_mode="never",
)

if __name__ == "__main__":
    # Enable the request queue, then start the app.
    queued_app = iface.queue()
    queued_app.launch()
|