# TTS / app.py
# Hassan-16's picture
# Update app.py
# 1684657 verified
# raw
# history blame
# 8.03 kB
import gradio as gr
import os
import torch
import logging
import soundfile as sf
import time
from kokoro import KModel, KPipeline
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Configuration: voice packs and generated audio live next to this file.
VOICE_DIR = os.path.join(os.path.dirname(__file__), "voices")
OUTPUT_DIR = os.path.join(os.path.dirname(__file__), "output_audio")
TEXT = "Hello, this is a test of the Kokoro TTS system."

# Ensure directories exist
os.makedirs(VOICE_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Device setup
CUDA_AVAILABLE = torch.cuda.is_available()
device = "cuda" if CUDA_AVAILABLE else "cpu"
logger.info(f"Using hardware: {device}")

# Directory handed to KModel as its cache location
MODEL_CACHE_DIR = os.path.join(os.path.dirname(__file__), "model_cache")
os.makedirs(MODEL_CACHE_DIR, exist_ok=True)

# Load a single model instance that both language pipelines share.
# NOTE(review): the KModel constructor signature differs between kokoro
# releases - confirm the installed version accepts `cache_dir`.
try:
    start_time = time.time()
    model = KModel("hexgrad/Kokoro-82M", cache_dir=MODEL_CACHE_DIR).to(device).eval()
    logger.info(f"Model loading time: {time.time() - start_time} seconds")
except Exception as e:
    # Log for the operator, then let the failure abort start-up.
    logger.error(f"Failed to load model: {e}")
    raise

# Define pipelines for American ('a') and British ('b') English.
# The keys are matched against the first character of a voice file name.
pipelines = {
    'a': KPipeline(model=model, lang_code='a', device=device),  # American English
    'b': KPipeline(model=model, lang_code='b', device=device),  # British English
}

# Set custom pronunciations for "kokoro".
# The phoneme strings must be real IPA characters; a mis-decoded (mojibake)
# copy would store meaningless symbols in the lexicon.
try:
    pipelines["a"].g2p.lexicon.golds["kokoro"] = "kˈOkəɹO"
    pipelines["b"].g2p.lexicon.golds["kokoro"] = "kˈQkəɹQ"
except AttributeError as e:
    # Best effort: a pipeline without this lexicon attribute chain is tolerated.
    logger.warning(f"Could not set custom pronunciations: {e}")

# Cache voice choices to avoid repeated file scanning
VOICE_CHOICES = None
def load_voice_choices():
    """Return the mapping of dropdown labels to voice file names.

    Scans ``VOICE_DIR`` for ``.pt`` files on the first call and stores the
    result in the module-level ``VOICE_CHOICES``; later calls return that
    stored mapping without touching the file system.

    A file whose two-letter prefix is ``af``/``am``/``bf``/``bm`` is labelled
    with a flag emoji, a gender emoji and the capitalized text that follows
    the prefix and its separator character. Any other file is labelled
    ``"Unknown <name without .pt>"``. Files are processed in sorted order so
    the resulting mapping has a stable order across runs.

    Returns:
        dict: label -> voice file name. When no voice file is found, a single
        placeholder entry pointing at ``af_bella.pt`` is returned.
    """
    global VOICE_CHOICES
    if VOICE_CHOICES is not None:
        return VOICE_CHOICES

    # Two-letter file prefix -> flag and gender emoji shown before the name.
    prefix_badges = {
        "af": "🇺🇸 🚺",
        "am": "🇺🇸 🚹",
        "bf": "🇬🇧 🚺",
        "bm": "🇬🇧 🚹",
    }

    choices = {}
    # os.listdir returns names in arbitrary order; sort for a stable listing.
    for voice_file in sorted(f for f in os.listdir(VOICE_DIR) if f.endswith(".pt")):
        stem = voice_file[:-3]  # file name without the ".pt" suffix
        badge = prefix_badges.get(voice_file[:2])
        if badge is None:
            label = f"Unknown {stem}"
        else:
            # Skip the two-letter prefix and the separator that follows it.
            label = f"{badge} {stem[3:].capitalize()}"
        choices[label] = voice_file

    if not choices:
        logger.warning("No voice files found in VOICE_DIR. Adding a placeholder.")
        choices = {"🇺🇸 🚺 Bella 🔥": "af_bella.pt"}

    VOICE_CHOICES = choices
    return choices
# Resolve the label -> file mapping once, at import time.
CHOICES = load_voice_choices()

# Log available voices, warning about any entry whose file is not on disk
# (this happens for the placeholder entry when no voice file exists).
for label, voice_path in CHOICES.items():
    full_path = os.path.join(VOICE_DIR, voice_path)
    if os.path.exists(full_path):
        logger.info(f"Loaded voice: {label} ({voice_path})")
        continue
    logger.warning(f"Voice file not found: {full_path}")
def generate_first(text, voice="af_bella.pt", speed=1, use_gpu=CUDA_AVAILABLE):
    """Synthesize only the first segment the pipeline produces for ``text``.

    Args:
        text: Text to speak.
        voice: File name of a voice pack inside ``VOICE_DIR``. Its first
            character selects the language pipeline, so it must be a key of
            ``pipelines``.
        speed: Speed value forwarded to the pipeline.
        use_gpu: Run on CUDA when True; forced to False when CUDA is not
            available.

    Returns:
        ``((24000, samples), phonemes)`` for the first segment, or
        ``(None, "")`` when the pipeline yields nothing. 24000 is presumably
        the sample rate in Hz - confirm against the model.

    Raises:
        FileNotFoundError: The voice file does not exist.
        KeyError: No pipeline is registered for the voice's first character.
        gr.Error: Generation failed with a Gradio error while on the CPU.
    """
    start_time = time.time()
    voice_path = os.path.join(VOICE_DIR, voice)
    if not os.path.exists(voice_path):
        raise FileNotFoundError(f"Voice file not found: {voice_path}")
    pipeline = pipelines[voice[0]]
    use_gpu = use_gpu and CUDA_AVAILABLE

    def first_segment(log_label):
        # Return the first segment's result, or None if nothing is yielded.
        for _, ps, audio in pipeline(text, voice=voice_path, speed=speed):
            logger.info("%s: %s seconds", log_label, time.time() - start_time)
            return (24000, audio.numpy()), ps
        return None

    try:
        # The model is shared by every request. Move it to the device this
        # request asked for in BOTH directions; otherwise one CPU request (or
        # one CPU retry) would leave all later GPU requests on the CPU.
        target = "cuda" if use_gpu else "cpu"
        if model.device.type != target:
            model.to(target)
        result = first_segment("Generation time")
    except gr.exceptions.Error as e:
        if not use_gpu:
            raise gr.Error(e) from e
        # GPU attempt failed: tell the user and retry once on the CPU.
        gr.Warning(str(e))
        gr.Info("Retrying with CPU.")
        model.to("cpu")
        result = first_segment("Generation time (CPU retry)")

    if result is None:
        return None, ""
    return result
def tokenize_first(text, voice="af_bella.pt"):
    """Return the phoneme string of the first segment produced for ``text``.

    Args:
        text: Text to convert.
        voice: File name of a voice pack inside ``VOICE_DIR``; its first
            character selects the language pipeline.

    Returns:
        The phonemes of the first segment, or ``""`` when the pipeline yields
        nothing.

    Raises:
        FileNotFoundError: The voice file does not exist.
    """
    voice_file = os.path.join(VOICE_DIR, voice)
    if not os.path.exists(voice_file):
        raise FileNotFoundError(f"Voice file not found: {voice_file}")

    segments = pipelines[voice[0]](text, voice=voice_file)
    # Only the first yielded segment matters; the rest is never requested.
    first = next(iter(segments), None)
    if first is None:
        return ""
    _graphemes, phonemes, _audio = first
    return phonemes
def generate_all(text, voice="af_bella.pt", speed=1, use_gpu=CUDA_AVAILABLE):
    """Yield audio for every segment of ``text``, one chunk at a time.

    This is a generator, so nothing below runs (including the voice file
    check) until the first chunk is requested.

    Args:
        text: Text to speak.
        voice: File name of a voice pack inside ``VOICE_DIR``; its first
            character selects the language pipeline.
        speed: Speed value forwarded to the pipeline.
        use_gpu: Run on CUDA when True; forced to False when CUDA is not
            available.

    Yields:
        ``(24000, samples)`` tuples. Right after the first segment an extra
        one-sample chunk of zeros is yielded.

    Raises:
        FileNotFoundError: The voice file does not exist.
        KeyError: No pipeline is registered for the voice's first character.
    """
    start_time = time.time()
    voice_path = os.path.join(VOICE_DIR, voice)
    if not os.path.exists(voice_path):
        raise FileNotFoundError(f"Voice file not found: {voice_path}")
    pipeline = pipelines[voice[0]]
    use_gpu = use_gpu and CUDA_AVAILABLE

    # The model is shared by every request. Move it to the device this
    # request asked for in BOTH directions; otherwise one CPU request would
    # leave all later GPU requests running on the CPU.
    target = "cuda" if use_gpu else "cpu"
    if model.device.type != target:
        model.to(target)

    first = True
    for _, _, audio in pipeline(text, voice=voice_path, speed=speed):
        yield 24000, audio.numpy()
        if first:
            first = False
            # NOTE(review): presumably a nudge so streaming playback starts
            # promptly - confirm before removing.
            yield 24000, torch.zeros(1).numpy()
    # Reached only when the stream runs to completion.
    logger.info(f"Streaming generation time: {time.time() - start_time} seconds")
# Help text shown under the token output box (passed to gr.Markdown).
# The emoji, punctuation and IPA stress marks must be the real characters.
# NOTE(review): rendered as Markdown, the bracket/parenthesis examples will
# presumably turn into links unless wrapped in backticks - confirm in the UI.
TOKEN_NOTE = '''
💡 Customize pronunciation with Markdown link syntax and /slashes/ like [Kokoro](/kˈOkəɹO/)
💬 To adjust intonation, try punctuation ;:,.!?—…"()“” or stress ˈ and ˌ
⬇️ Lower stress [1 level](-1) or [2 levels](-2)
⬆️ Raise stress 1 level [or](+2) 2 levels (only works on less stressed, usually short words)
'''
# Gradio UI. Left column: text, voice, hardware and speed inputs.
# Right column: a "Generate" tab (one-shot audio plus tokens) and a "Stream"
# tab (chunked audio with a stop button).
with gr.Blocks(theme="soft") as app:
    with gr.Row():
        with gr.Column():
            text = gr.Textbox(label="Input Text", value=TEXT, info="Arbitrarily many characters supported")
            with gr.Row():
                voice = gr.Dropdown(
                    list(CHOICES.items()),
                    # Prefer Bella when present, else the first known voice.
                    value="af_bella.pt" if "af_bella.pt" in CHOICES.values() else next(iter(CHOICES.values())),
                    label="Voice",
                    info="Quality and availability vary by language",
                )
                use_gpu = gr.Dropdown(
                    [("GPU 🚀", True), ("CPU 🐌", False)],
                    value=CUDA_AVAILABLE,
                    label="Hardware",
                    info="GPU is faster but requires CUDA support",
                    # Without CUDA there is nothing to choose between.
                    interactive=CUDA_AVAILABLE,
                )
            speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label="Speed")
        with gr.Column():
            with gr.Tab(label="Generate"):
                out_audio = gr.Audio(label="Output Audio", interactive=False, streaming=False, autoplay=True)
                generate_btn = gr.Button("Generate", variant="primary")
                with gr.Accordion("Output Tokens", open=True):
                    out_ps = gr.Textbox(
                        interactive=False,
                        show_label=False,
                        info="Tokens used to generate the audio, up to 510 context length.",
                    )
                    tokenize_btn = gr.Button("Tokenize", variant="secondary")
                    gr.Markdown(TOKEN_NOTE)
            with gr.Tab(label="Stream"):
                out_stream = gr.Audio(label="Output Audio Stream", interactive=False, streaming=True, autoplay=True)
                with gr.Row():
                    stream_btn = gr.Button("Stream", variant="primary")
                    stop_btn = gr.Button("Stop", variant="stop")
                gr.Markdown("⚠️ Streaming may have initial delays due to processing.")

    # Event wiring
    generate_btn.click(fn=generate_first, inputs=[text, voice, speed, use_gpu], outputs=[out_audio, out_ps])
    tokenize_btn.click(fn=tokenize_first, inputs=[text, voice], outputs=[out_ps])
    stream_event = stream_btn.click(fn=generate_all, inputs=[text, voice, speed, use_gpu], outputs=[out_stream])
    # "Stop" runs no function of its own; it only cancels the running stream.
    stop_btn.click(fn=None, cancels=[stream_event])
if __name__ == "__main__":
    logger.info("Starting Gradio app...")
    # When run as a script, launch() blocks the main thread while the server
    # is running, so the line after it executes only once the server is down.
    app.launch()
    logger.info("Gradio app stopped.")