Spaces:

Hassan-16
/

TTS

Running

App Files Files Community

TTS / app.py

Hassan-16

Update app.py

976f3b9 verified 2 months ago

raw

history blame

8.13 kB

	from kokoro import KModel, KPipeline
	import gradio as gr
	import os
	import torch
	import logging
	import soundfile as sf

	# Logging: INFO-level root configuration plus a module-level logger.
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)

	# Paths: voices are read from, and audio output is meant for, directories
	# that sit next to this file.
	_APP_DIR = os.path.dirname(__file__)
	VOICE_DIR = os.path.join(_APP_DIR, "voices")
	OUTPUT_DIR = os.path.join(_APP_DIR, "output_audio")
	TEXT = "Hello, this is a test of the Kokoro TTS system."

	# Create both directories up front; already-existing ones are left alone.
	for _directory in (VOICE_DIR, OUTPUT_DIR):
	    os.makedirs(_directory, exist_ok=True)

	# Hardware: use CUDA when torch reports it, otherwise fall back to the CPU.
	CUDA_AVAILABLE = torch.cuda.is_available()
	if CUDA_AVAILABLE:
	    device = "cuda"
	else:
	    device = "cpu"
	logger.info(f"Using hardware: {device}")

	# One model instance per usable device, keyed by "is this the GPU copy?".
	# The CPU model (key False) always exists; the GPU model (key True) is only
	# created when CUDA is available.
	_gpu_flags = (False, True) if CUDA_AVAILABLE else (False,)
	models = {
	    on_gpu: KModel("hexgrad/Kokoro-82M").to("cuda" if on_gpu else "cpu").eval()
	    for on_gpu in _gpu_flags
	}

	# One pipeline per language code: 'a' is American English, 'b' is British
	# English. Both start out bound to the CPU model.
	pipelines = {
	    lang_code: KPipeline(model=models[False], lang_code=lang_code, device='cpu')
	    for lang_code in ('a', 'b')
	}

	# Register a custom pronunciation of "kokoro" for the American ('a') and
	# British ('b') pipelines. This is best-effort: the first AttributeError
	# is logged as a warning and any remaining entries are skipped.
	_KOKORO_PRONUNCIATIONS = (("a", "kˈOkəɹO"), ("b", "kˈQkəɹQ"))
	try:
	    for _lang_code, _pronunciation in _KOKORO_PRONUNCIATIONS:
	        pipelines[_lang_code].g2p.lexicon.golds["kokoro"] = _pronunciation
	except AttributeError as e:
	    logger.warning(f"Could not set custom pronunciations: {e}")

	def _synthesize_first_segment(pipeline, model, text, voice_path, speed):
	    """Run ``pipeline`` with ``model`` and return its first result.

	    Args:
	        pipeline: One of the objects stored in ``pipelines``. Its ``model``
	            attribute is reassigned to ``model`` before the text is processed.
	        model: One of the objects stored in ``models``.
	        text: Input text to synthesize.
	        voice_path: Path of the ``.pt`` voice file passed as ``voice``.
	        speed: Speed value forwarded to the pipeline.

	    Returns:
	        ``(ps, audio)`` taken from the first item the pipeline yields, or
	        ``None`` when the pipeline yields nothing.
	    """
	    pipeline.model = model
	    for _, ps, audio in pipeline(text, voice=voice_path, speed=speed):
	        return ps, audio
	    return None

	def forward_gpu(text, voice_path, speed):
	    """Synthesize the first segment of ``text`` with the GPU model.

	    Args:
	        text: Input text to synthesize.
	        voice_path: Voice file name or full path; the first character of the
	            file name selects the pipeline ('a' or 'b').
	        speed: Speed value forwarded to the pipeline.

	    Returns:
	        The audio of the first segment, or ``None`` if nothing was generated.

	    Raises:
	        KeyError: If the file name does not start with a key of ``pipelines``
	            or no GPU model was loaded into ``models``.
	    """
	    # The language code comes from the file name, not from the path: callers
	    # pass a joined path whose first character is a directory separator or
	    # directory name, which is never a pipeline key.
	    lang_code = os.path.basename(voice_path)[0]
	    segment = _synthesize_first_segment(
	        pipelines[lang_code], models[True], text, voice_path, speed
	    )
	    return None if segment is None else segment[1]

	def generate_first(text, voice="af_bella.pt", speed=1, use_gpu=CUDA_AVAILABLE):
	    """Synthesize the first segment of ``text`` and return audio plus tokens.

	    Args:
	        text: Input text to synthesize.
	        voice: File name of a ``.pt`` voice inside ``VOICE_DIR``; its first
	            character selects the pipeline.
	        speed: Speed value forwarded to the pipeline.
	        use_gpu: Request the GPU model; ignored when CUDA is unavailable.

	    Returns:
	        ``((24000, samples), ps)`` for the first segment, or ``(None, "")``
	        when the pipeline yields nothing.

	    Raises:
	        FileNotFoundError: If the voice file does not exist.
	        gr.Error: If a Gradio error occurs while running on the CPU.
	    """
	    voice_path = os.path.join(VOICE_DIR, voice)
	    if not os.path.exists(voice_path):
	        raise FileNotFoundError(f"Voice file not found: {voice_path}")

	    pipeline = pipelines[voice[0]]
	    use_gpu = bool(use_gpu and CUDA_AVAILABLE)
	    try:
	        # Both the GPU and the CPU path must produce a result that is
	        # returned below; the GPU path used to drop its audio.
	        segment = _synthesize_first_segment(
	            pipeline, models[use_gpu], text, voice_path, speed
	        )
	    except gr.exceptions.Error as e:
	        if not use_gpu:
	            raise gr.Error(e)
	        gr.Warning(str(e))
	        gr.Info("Retrying with CPU. To avoid this error, change Hardware to CPU.")
	        segment = _synthesize_first_segment(
	            pipeline, models[False], text, voice_path, speed
	        )

	    if segment is None:
	        return None, ""
	    ps, audio = segment
	    return (24000, audio.numpy()), ps

	def predict(text, voice="af_bella.pt", speed=1):
	    """Return only the audio part of ``generate_first``, forcing the CPU path.

	    Args:
	        text: Input text to synthesize.
	        voice: File name of the ``.pt`` voice to use.
	        speed: Speed value forwarded to ``generate_first``.

	    Returns:
	        The first element of the ``generate_first`` result.
	    """
	    result = generate_first(text, voice, speed, use_gpu=False)
	    return result[0]

	def tokenize_first(text, voice="af_bella.pt"):
	    """Return the ``ps`` value of the first item the pipeline yields for ``text``.

	    Args:
	        text: Input text to tokenize.
	        voice: File name of a ``.pt`` voice inside ``VOICE_DIR``; its first
	            character selects the pipeline.

	    Returns:
	        The ``ps`` value of the first item, or ``""`` when the pipeline
	        yields nothing.

	    Raises:
	        FileNotFoundError: If the voice file does not exist.
	    """
	    voice_path = os.path.join(VOICE_DIR, voice)
	    if not os.path.exists(voice_path):
	        raise FileNotFoundError(f"Voice file not found: {voice_path}")

	    segments = iter(pipelines[voice[0]](text, voice=voice_path))
	    try:
	        first_segment = next(segments)
	    except StopIteration:
	        # The pipeline produced no items at all.
	        return ""
	    _, ps, _ = first_segment
	    return ps

	def generate_all(text, voice="af_bella.pt", speed=1, use_gpu=CUDA_AVAILABLE):
	    """Yield ``(24000, samples)`` chunks for every segment of ``text``.

	    This is a generator, so the voice-file check only runs once the first
	    chunk is requested.

	    Args:
	        text: Input text to synthesize.
	        voice: File name of a ``.pt`` voice inside ``VOICE_DIR``; its first
	            character selects the pipeline.
	        speed: Speed value forwarded to the pipeline.
	        use_gpu: Request the GPU model; ignored when CUDA is unavailable.

	    Yields:
	        ``(24000, samples)`` tuples, one per generated segment, plus one
	        single-zero-sample chunk right after the first segment.

	    Raises:
	        FileNotFoundError: If the voice file does not exist.
	    """
	    voice_path = os.path.join(VOICE_DIR, voice)
	    if not os.path.exists(voice_path):
	        raise FileNotFoundError(f"Voice file not found: {voice_path}")

	    pipeline = pipelines[voice[0]]
	    # Honour the GPU request only when CUDA is actually available.
	    pipeline.model = models[bool(use_gpu and CUDA_AVAILABLE)]

	    segments = pipeline(text, voice=voice_path, speed=speed)
	    for index, (_, _, audio) in enumerate(segments):
	        yield 24000, audio.numpy()
	        if index == 0:
	            # NOTE(review): presumably a workaround for the first-click
	            # streaming issue mentioned in the Stream tab note - confirm.
	            yield 24000, torch.zeros(1).numpy()

	# Dynamically load all .pt voice files from a voice directory
	def load_voice_choices(voice_dir=None):
	    """Build the ``{label: file name}`` mapping of available voices.

	    File names starting with ``af``/``am``/``bf``/``bm`` get a flag and
	    gender label followed by the capitalized name after the three-character
	    prefix; every other ``.pt`` file is labelled ``Unknown <stem>``.

	    Args:
	        voice_dir: Directory to scan for ``.pt`` files. Defaults to
	            ``VOICE_DIR`` when omitted.

	    Returns:
	        A dict mapping display label to file name, ordered by file name.
	        If two files produce the same label, the later file name wins.
	    """
	    if voice_dir is None:
	        voice_dir = VOICE_DIR

	    prefix_labels = {
	        'af': "🇺🇸 🚺",
	        'am': "🇺🇸 🚹",
	        'bf': "🇬🇧 🚺",
	        'bm': "🇬🇧 🚹",
	    }

	    # os.listdir returns entries in arbitrary order; sorting keeps the
	    # resulting mapping (and anything derived from its first entry) stable
	    # across runs and file systems.
	    voice_files = sorted(f for f in os.listdir(voice_dir) if f.endswith('.pt'))

	    choices = {}
	    for voice_file in voice_files:
	        tag = prefix_labels.get(voice_file[:2])
	        if tag is None:
	            label = f"Unknown {voice_file[:-3]}"
	        else:
	            label = f"{tag} {voice_file[3:-3].capitalize()}"
	        choices[label] = voice_file
	    return choices

	CHOICES = load_voice_choices()

	# Report each discovered voice; warn about entries whose file is missing.
	for label, voice_path in CHOICES.items():
	    full_path = os.path.join(VOICE_DIR, voice_path)
	    if os.path.exists(full_path):
	        logger.info(f"Loaded voice: {label} ({voice_path})")
	        continue
	    logger.warning(f"Voice file not found: {full_path}")

	# With no voices on disk, fall back to a single placeholder entry so the
	# voice dropdown still has a choice to show.
	if len(CHOICES) == 0:
	    logger.warning("No voice files found in VOICE_DIR. Adding a placeholder.")
	    CHOICES = {"🇺🇸 🚺 Bella 🔥": "af_bella.pt"}

	# Markdown help text rendered with gr.Markdown in the "Output Tokens"
	# accordion. It is a runtime string: edit its content only on purpose.
	TOKEN_NOTE = '''
	💡 Customize pronunciation with Markdown link syntax and /slashes/ like [Kokoro](/kˈOkəɹO/)

	💬 To adjust intonation, try punctuation ;:,.!?—…"()“” or stress ˈ and ˌ

	⬇️ Lower stress [1 level](-1) or [2 levels](-2)

	⬆️ Raise stress 1 level [or](+2) 2 levels (only works on less stressed, usually short words)
	'''

	# "Generate" tab: a non-streaming audio player, the Generate button and an
	# accordion holding the token output and its helper controls.
	# NOTE(review): the nesting below is reconstructed because the original
	# indentation was lost - confirm that Tokenize, the Markdown note and the
	# hidden Predict button belong inside the accordion.
	with gr.Blocks() as generate_tab:
	    out_audio = gr.Audio(
	        label="Output Audio",
	        interactive=False,
	        streaming=False,
	        autoplay=True,
	    )
	    generate_btn = gr.Button("Generate", variant="primary")
	    with gr.Accordion("Output Tokens", open=True):
	        out_ps = gr.Textbox(
	            interactive=False,
	            show_label=False,
	            info="Tokens used to generate the audio, up to 510 context length.",
	        )
	        tokenize_btn = gr.Button("Tokenize", variant="secondary")
	        gr.Markdown(TOKEN_NOTE)
	        predict_btn = gr.Button("Predict", variant="secondary", visible=False)

	# "Stream" tab: a streaming audio player, Stream/Stop buttons side by side
	# and a note about a known first-click issue.
	# NOTE(review): the nesting below is reconstructed because the original
	# indentation was lost - confirm that the DuplicateButton belongs inside
	# the "Note" accordion.
	with gr.Blocks() as stream_tab:
	    out_stream = gr.Audio(
	        label="Output Audio Stream",
	        interactive=False,
	        streaming=True,
	        autoplay=True,
	    )
	    with gr.Row():
	        stream_btn = gr.Button("Stream", variant="primary")
	        stop_btn = gr.Button("Stop", variant="stop")
	    with gr.Accordion("Note", open=True):
	        gr.Markdown("⚠️ There is an unknown Gradio bug that might yield no audio the first time you click Stream.")
	        gr.DuplicateButton()

	# Main application: inputs in the left column, the Generate/Stream tabs in
	# the right column, followed by the event wiring.
	# NOTE(review): the nesting below is reconstructed because the original
	# indentation was lost - confirm the layout against the running app.

	# Preselect "af_bella.pt" when it is available, otherwise the first voice.
	_voice_files = list(CHOICES.values())
	_default_voice = "af_bella.pt" if "af_bella.pt" in _voice_files else _voice_files[0]

	with gr.Blocks() as app:
	    with gr.Row():
	        with gr.Column():
	            text = gr.Textbox(
	                label="Input Text",
	                info="Arbitrarily many characters supported",
	            )
	            with gr.Row():
	                voice = gr.Dropdown(
	                    list(CHOICES.items()),
	                    value=_default_voice,
	                    label="Voice",
	                    info="Quality and availability vary by language",
	                )
	                # The hardware choice is only editable when CUDA is present.
	                use_gpu = gr.Dropdown(
	                    [("GPU 🚀", True), ("CPU 🐌", False)],
	                    value=CUDA_AVAILABLE,
	                    label="Hardware",
	                    info="GPU is usually faster, but may require CUDA support",
	                    interactive=CUDA_AVAILABLE,
	                )
	            speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label="Speed")
	        with gr.Column():
	            gr.TabbedInterface([generate_tab, stream_tab], ["Generate", "Stream"])

	    # Event wiring.
	    generate_btn.click(
	        fn=generate_first,
	        inputs=[text, voice, speed, use_gpu],
	        outputs=[out_audio, out_ps],
	    )
	    tokenize_btn.click(fn=tokenize_first, inputs=[text, voice], outputs=[out_ps])
	    stream_event = stream_btn.click(
	        fn=generate_all,
	        inputs=[text, voice, speed, use_gpu],
	        outputs=[out_stream],
	    )
	    # Stop runs no function of its own; it only cancels the stream event.
	    stop_btn.click(fn=None, cancels=[stream_event])
	    predict_btn.click(fn=predict, inputs=[text, voice, speed], outputs=[out_audio])

	# Script entry point: queue() is applied first, then the app is launched.
	if __name__ == "__main__":
	    queued_app = app.queue()
	    queued_app.launch()