Spaces:

whatvn
/

vietnamese-tts

Running on Zero

vietnamese-tts / app.py

Hưng

clearer description

efe0cff 2 months ago

3.54 kB

	import spaces
	import gradio as gr
	from cached_path import cached_path
	import tempfile

	from f5_tts.model import DiT
	from f5_tts.infer.utils_infer import (
	preprocess_ref_audio_text,
	load_vocoder,
	load_model,
	infer_process,
	save_spectrogram,
	)


	# Load the vocoder and the TTS model once at import time so that every
	# call to infer() reuses the same objects.
	vocoder = load_vocoder()

	# Keyword settings for the DiT network class handed to load_model.
	dit_config = {
	    "dim": 1024,
	    "depth": 22,
	    "heads": 16,
	    "ff_mult": 2,
	    "text_dim": 512,
	    "conv_layers": 4,
	}

	# cached_path returns a path-like object for each hf:// URL; load_model is
	# given plain strings. (Presumably the files are fetched on first use and
	# cached locally -- confirm against the cached_path docs.)
	checkpoint_path = str(cached_path("hf://whatvn/vietnamese-tts/model.tensors"))
	vocab_path = str(cached_path("hf://whatvn/vietnamese-tts/vocab.txt"))

	model = load_model(
	    DiT,
	    dit_config,
	    ckpt_path=checkpoint_path,
	    vocab_file=vocab_path,
	)


	@spaces.GPU
	def infer(ref_audio_orig: str, ref_text: str, gen_text: str, speed: float = 1.0):
	    """Synthesize speech for ``gen_text`` using a reference clip.

	    Args:
	        ref_audio_orig: File path of the reference audio (the Gradio input
	            component is configured with ``type="filepath"``).
	        ref_text: Text of the reference audio.
	        gen_text: Text to synthesize.
	        speed: Speed value forwarded unchanged to ``infer_process``.

	    Returns:
	        A pair ``((sample_rate, waveform), spectrogram_path)``. The first
	        element feeds the ``type="numpy"`` audio output; the second is the
	        path of a PNG temp file written by ``save_spectrogram``.

	    Raises:
	        gr.Error: If any required input is missing or blank, or if any step
	            of preprocessing, inference or spectrogram saving fails. In the
	            latter case the original exception is chained as ``__cause__``.
	    """
	    # Validate inputs up front so the user gets a specific message.
	    if ref_audio_orig is None:
	        raise gr.Error("Reference audio is required.")

	    if not gen_text or not gen_text.strip():
	        raise gr.Error("Text to generate is required.")

	    if not ref_text or not ref_text.strip():
	        raise gr.Error("Ref Text is required.")

	    try:
	        ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text)
	        final_wave, final_sample_rate, combined_spectrogram = infer_process(
	            ref_audio,
	            ref_text,
	            gen_text,
	            model,
	            vocoder,
	            cross_fade_duration=0.15,
	            nfe_step=32,
	            speed=speed,
	        )

	        # Only reserve a unique .png path here; delete=False keeps the file
	        # on disk after the handle closes so it can be served afterwards.
	        with tempfile.NamedTemporaryFile(
	            suffix=".png", delete=False
	        ) as tmp_spectrogram:
	            spectrogram_path = tmp_spectrogram.name

	        # Write the image once our own handle on the file is closed.
	        save_spectrogram(combined_spectrogram, spectrogram_path)
	    except Exception as e:
	        # Surface the failure in the UI while keeping the original traceback
	        # attached for server-side debugging.
	        raise gr.Error(f"An error occurred during inference: {e}") from e

	    return (final_sample_rate, final_wave), spectrogram_path


	# Input widgets, in the same order as infer()'s parameters:
	# reference audio path, reference text, text to generate, speed.
	input_components = [
	    gr.components.Audio(type="filepath", label="Reference Audio"),
	    gr.components.Textbox(label="Reference audio text", lines=3),
	    gr.components.Textbox(label="Text to Generate", lines=3),
	    gr.components.Slider(
	        label="Speed",
	        minimum=0.3,
	        maximum=2.0,
	        value=1.0,
	        step=0.1,
	        info="Adjust the speed of the audio.",
	    ),
	]

	# Output widgets, matching infer()'s return pair:
	# (sample_rate, waveform) and the spectrogram image path.
	output_components = [
	    gr.components.Audio(type="numpy", label="Synthesized Audio"),
	    gr.components.Image(type="filepath", label="Spectrogram"),
	]

	# Each example row supplies one value per input component.
	example_rows = [
	    [
	        "examples/pc-01.wav",
	        "để hiểu sâu sắc một sự việc, một vấn đề, từ đó khai thác được tốt hơn quá trình hình thành nên vấn đề",
	        "Bà Lê Lan Chi, Tổng giám đốc Zalopay cho biết ứng dụng đã có một năm nhiều dấu ấn khi đồng hành với hàng triệu người dùng",
	        1.0,
	    ],
	    [
	        "examples/pc-02.wav",
	        "có nghĩa là cảm xúc là vốn có, là tức thời, là bản năng, đối với một sự việc con người có những phản ứng hay cảm xúc khác nhau",
	        "Bạn đã nhận được thanh toán thành công số tiền ba mươi ngàn đồng",
	        1.0,
	    ],
	]

	# Gradio UI wiring infer() to the components above.
	iface = gr.Interface(
	    title="Vietnamese TTS",
	    description="Vietnamese TTS model trained with public data (around 200 hours Vietnamese voice) using [F5-TTS](https://github.com/SWivid/F5-TTS) model. The model is published at https://huggingface.co/whatvn/vietnamese-tts",
	    fn=infer,
	    inputs=input_components,
	    outputs=output_components,
	    submit_btn="Synthesize",
	    clear_btn=None,
	    flagging_mode="never",
	    examples=example_rows,
	)

	if __name__ == "__main__":
	    # Script entry point: turn on the interface's queue, then launch it.
	    queued_app = iface.queue()
	    queued_app.launch()