"""Gradio demo app for Kokoro text-to-speech.

Voices are ``.pt`` files discovered in ``VOICE_DIR``. The first letter of a
voice file name selects the language pipeline ('a' = American English,
'b' = British English).
"""

import logging
import os

import gradio as gr
import soundfile as sf
import torch
from kokoro import KModel, KPipeline

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Configuration
VOICE_DIR = os.path.join(os.path.dirname(__file__), "voices")
OUTPUT_DIR = os.path.join(os.path.dirname(__file__), "output_audio")
TEXT = "Hello, this is a test of the Kokoro TTS system."
# Sample rate (Hz) reported to Gradio alongside every audio chunk.
SAMPLE_RATE = 24000

# Ensure directories exist
os.makedirs(VOICE_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Device setup
CUDA_AVAILABLE = torch.cuda.is_available()
device = "cuda" if CUDA_AVAILABLE else "cpu"
logger.info("Using hardware: %s", device)

# Load a single model instance (shared by every pipeline and every request)
model = KModel("hexgrad/Kokoro-82M").to(device).eval()

# Define pipelines for American ('a') and British ('b') English
pipelines = {
    "a": KPipeline(model=model, lang_code="a", device=device),  # American English
    "b": KPipeline(model=model, lang_code="b", device=device),  # British English
}

# Set custom pronunciations for "kokoro"
try:
    pipelines["a"].g2p.lexicon.golds["kokoro"] = "kˈOkəɹO"
    pipelines["b"].g2p.lexicon.golds["kokoro"] = "kˈQkəɹQ"
except AttributeError as e:
    logger.warning("Could not set custom pronunciations: %s", e)


def _resolve_voice(voice):
    """Return ``(pipeline, voice_path)`` for a voice file name in VOICE_DIR.

    Raises:
        gr.Error: if no voice is selected, or the first letter of the file
            name does not match any configured language pipeline.
        FileNotFoundError: if the voice file does not exist in VOICE_DIR.
    """
    if not voice:
        raise gr.Error("No voice selected.")
    voice_path = os.path.join(VOICE_DIR, voice)
    if not os.path.exists(voice_path):
        raise FileNotFoundError(f"Voice file not found: {voice_path}")
    # The first letter of the file name is the language code of the pipeline.
    pipeline = pipelines.get(voice[:1])
    if pipeline is None:
        supported = ", ".join(sorted(pipelines))
        raise gr.Error(
            f"Unsupported voice '{voice}': file name must start with one of "
            f"the language codes: {supported}"
        )
    return pipeline, voice_path


def _synthesize_first(pipeline, text, voice_path, speed):
    """Return ``((SAMPLE_RATE, audio), tokens)`` for the first segment of *text*.

    Returns ``(None, "")`` when the pipeline yields no segment at all.
    """
    for _, ps, audio in pipeline(text, voice=voice_path, speed=speed):
        return (SAMPLE_RATE, audio.numpy()), ps
    return None, ""


def generate_first(text, voice="af_bella.pt", speed=1, use_gpu=CUDA_AVAILABLE):
    """Synthesize the first segment of *text* and return its audio and tokens.

    Args:
        text: Input text to speak.
        voice: Voice file name inside VOICE_DIR.
        speed: Speed value passed through to the pipeline.
        use_gpu: Requested hardware; ignored when CUDA is unavailable.

    Returns:
        ``((sample_rate, samples), tokens)``, or ``(None, "")`` if the
        pipeline produced nothing.

    Raises:
        FileNotFoundError: if the voice file is missing.
        gr.Error: for an unusable voice, or when synthesis fails on CPU.
    """
    pipeline, voice_path = _resolve_voice(voice)
    use_gpu = use_gpu and CUDA_AVAILABLE
    # Honour the Hardware selection on every request: the model is shared, so
    # an earlier CPU request (or CPU fallback) may have left it on the CPU.
    # NOTE(review): concurrent requests share this model; moving it while
    # another request is running is not guarded against.
    model.to(device if use_gpu else "cpu")
    try:
        return _synthesize_first(pipeline, text, voice_path, speed)
    except (RuntimeError, gr.exceptions.Error) as e:
        # RuntimeError is included because torch reports CUDA failures
        # (e.g. out of memory) that way; without it the fallback never runs.
        if not use_gpu:
            raise gr.Error(str(e)) from e
        gr.Warning(str(e))
        gr.Info("Retrying with CPU. To avoid this error, change Hardware to CPU.")
        model.to("cpu")
        return _synthesize_first(pipeline, text, voice_path, speed)


def tokenize_first(text, voice="af_bella.pt"):
    """Return the tokens of the first segment of *text*, or ``""`` if none.

    Raises:
        FileNotFoundError: if the voice file is missing.
        gr.Error: for an unusable voice.
    """
    pipeline, voice_path = _resolve_voice(voice)
    # NOTE(review): the pipelines are built with a model, so this presumably
    # synthesizes audio as well and discards it - confirm against kokoro.
    for _, ps, _ in pipeline(text, voice=voice_path):
        return ps
    return ""


def generate_all(text, voice="af_bella.pt", speed=1, use_gpu=CUDA_AVAILABLE):
    """Yield ``(sample_rate, samples)`` for every segment of *text*.

    A one-sample silent chunk is yielded right after the first segment.

    Args:
        text: Input text to speak.
        voice: Voice file name inside VOICE_DIR.
        speed: Speed value passed through to the pipeline.
        use_gpu: Requested hardware; ignored when CUDA is unavailable.

    Raises:
        FileNotFoundError: if the voice file is missing.
        gr.Error: for an unusable voice.
    """
    pipeline, voice_path = _resolve_voice(voice)
    use_gpu = use_gpu and CUDA_AVAILABLE
    # Place the shared model on the requested device (also moves it back to
    # the GPU after an earlier CPU request).
    model.to(device if use_gpu else "cpu")
    first = True
    for _, _, audio in pipeline(text, voice=voice_path, speed=speed):
        yield SAMPLE_RATE, audio.numpy()
        if first:
            first = False
            # NOTE(review): looks like a workaround to get the streaming
            # player started after the first chunk - confirm before removing.
            yield SAMPLE_RATE, torch.zeros(1).numpy()


# Two-letter file-name prefix -> flag and gender emoji used in the dropdown.
_VOICE_LABEL_PREFIXES = {
    "af": "🇺🇸 🚺",
    "am": "🇺🇸 🚹",
    "bf": "🇬🇧 🚺",
    "bm": "🇬🇧 🚹",
}


# Dynamically load .pt voice files from VOICE_DIR
def load_voice_choices():
    """Return a ``{label: file_name}`` dict for every ``.pt`` file in VOICE_DIR.

    Files are visited in sorted order so the dropdown order is deterministic.
    Files whose prefix is not recognised get an ``Unknown ...`` label.
    """
    choices = {}
    for voice_file in sorted(os.listdir(VOICE_DIR)):
        if not voice_file.endswith(".pt"):
            continue
        emoji = _VOICE_LABEL_PREFIXES.get(voice_file[:2])
        if emoji is None:
            label = f"Unknown {voice_file[:-3]}"
        else:
            # Drop the "xx_" prefix and the ".pt" suffix for the display name.
            label = f"{emoji} {voice_file[3:-3].capitalize()}"
        choices[label] = voice_file
    return choices


CHOICES = load_voice_choices()

# Log available voices
for label, voice_path in CHOICES.items():
    full_path = os.path.join(VOICE_DIR, voice_path)
    if not os.path.exists(full_path):
        logger.warning("Voice file not found: %s", full_path)
    else:
        logger.info("Loaded voice: %s (%s)", label, voice_path)

# If no voices are found, add a default fallback
if not CHOICES:
    logger.warning("No voice files found in VOICE_DIR. Adding a placeholder.")
    CHOICES = {"🇺🇸 🚺 Bella 🔥": "af_bella.pt"}

# Preselect Bella when present, otherwise the first discovered voice.
DEFAULT_VOICE = (
    "af_bella.pt" if "af_bella.pt" in CHOICES.values() else next(iter(CHOICES.values()))
)

# The link-style examples are wrapped in code spans: this text is rendered by
# gr.Markdown, which would otherwise turn them into hyperlinks and hide the
# very syntax being documented.
TOKEN_NOTE = '''
💡 Customize pronunciation with Markdown link syntax and /slashes/ like `[Kokoro](/kˈOkəɹO/)`

💬 To adjust intonation, try punctuation ;:,.!?—…"()“” or stress ˈ and ˌ

⬇️ Lower stress `[1 level](-1)` or `[2 levels](-2)`

⬆️ Raise stress 1 level `[or](+2)` 2 levels (only works on less stressed, usually short words)
'''

with gr.Blocks() as generate_tab:
    out_audio = gr.Audio(
        label="Output Audio", interactive=False, streaming=False, autoplay=True
    )
    generate_btn = gr.Button("Generate", variant="primary")
    with gr.Accordion("Output Tokens", open=True):
        out_ps = gr.Textbox(
            interactive=False,
            show_label=False,
            info="Tokens used to generate the audio, up to 510 context length.",
        )
        tokenize_btn = gr.Button("Tokenize", variant="secondary")
        gr.Markdown(TOKEN_NOTE)

with gr.Blocks() as stream_tab:
    out_stream = gr.Audio(
        label="Output Audio Stream", interactive=False, streaming=True, autoplay=True
    )
    with gr.Row():
        stream_btn = gr.Button("Stream", variant="primary")
        stop_btn = gr.Button("Stop", variant="stop")
    with gr.Accordion("Note", open=True):
        gr.Markdown(
            "⚠️ There may be delays in streaming audio due to processing limitations."
        )

with gr.Blocks() as app:
    with gr.Row():
        with gr.Column():
            text = gr.Textbox(
                label="Input Text", info="Arbitrarily many characters supported"
            )
            with gr.Row():
                voice = gr.Dropdown(
                    list(CHOICES.items()),
                    value=DEFAULT_VOICE,
                    label="Voice",
                    info="Quality and availability vary by language",
                )
                use_gpu = gr.Dropdown(
                    [("GPU 🚀", True), ("CPU 🐌", False)],
                    value=CUDA_AVAILABLE,
                    label="Hardware",
                    info="GPU is usually faster, but may require CUDA support",
                    interactive=CUDA_AVAILABLE,
                )
            speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label="Speed")
        with gr.Column():
            gr.TabbedInterface([generate_tab, stream_tab], ["Generate", "Stream"])

    # Event listeners are registered inside the Blocks context of the app.
    generate_btn.click(
        fn=generate_first,
        inputs=[text, voice, speed, use_gpu],
        outputs=[out_audio, out_ps],
    )
    tokenize_btn.click(fn=tokenize_first, inputs=[text, voice], outputs=[out_ps])
    stream_event = stream_btn.click(
        fn=generate_all, inputs=[text, voice, speed, use_gpu], outputs=[out_stream]
    )
    stop_btn.click(fn=None, cancels=[stream_event])

if __name__ == "__main__":
    app.queue().launch()