Spaces:
Running
Running
| ####################################################################################### | |
| # | |
| # MIT License | |
| # | |
| # Copyright (c) [2025] [[email protected]] | |
| # | |
| # Permission is hereby granted, free of charge, to any person obtaining a copy | |
| # of this software and associated documentation files (the "Software"), to deal | |
| # in the Software without restriction, including without limitation the rights | |
| # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
| # copies of the Software, and to permit persons to whom the Software is | |
| # furnished to do so, subject to the following conditions: | |
| # | |
| # The above copyright notice and this permission notice shall be included in all | |
| # copies or substantial portions of the Software. | |
| # | |
| # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
| # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
| # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
| # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
| # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
| # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
| # SOFTWARE. | |
| # | |
| ####################################################################################### | |
| # This file implements an API endpoint for the English Kokoro Text-to-Speech (TTS) system. | |
| # It provides functionality to generate TTS audio from input English text using the Kokoro voice model. | |
| # Source code is based on or inspired by several projects. | |
| # For more details and proper attribution, please refer to the following resources: | |
| # | |
| # - [Kokoro] - [https://github.com/hexgrad/kokoro] | |
| # - [Misaki] - [https://github.com/hexgrad/misaki] | |
| # - [Kokoro-82M] - [https://huggingface.co/hexgrad/Kokoro-82M] | |
| # - [Kokoro-onnx] - [https://github.com/thewh1teagle/kokoro-onnx] | |
| import os | |
| import gradio as gr | |
| from huggingface_hub import snapshot_download | |
| from kokoro_onnx import Kokoro | |
| from misaki import en, espeak | |
# Hugging Face Hub repository that hosts the ONNX model and the voices bundle.
KOKORO_REPO_ID = "leonelhs/kokoro-thewh1teagle"

# Display label -> Kokoro voice id.
# Label layout: accent flag (🇺🇸 / 🇬🇧), gender sign (🚺 / 🚹), voice name.
# The same information is mirrored in the id prefix: a/b for the accent,
# f/m for the gender (e.g. 'bf_emma').
VOICES = {
    '🇺🇸 🚺 Heart ❤️': 'af_heart',
    '🇺🇸 🚺 Bella 🔥': 'af_bella',
    '🇺🇸 🚺 Nicole 🎧': 'af_nicole',
    '🇺🇸 🚺 Aoede': 'af_aoede',
    '🇺🇸 🚺 Kore': 'af_kore',
    '🇺🇸 🚺 Sarah': 'af_sarah',
    '🇺🇸 🚺 Nova': 'af_nova',
    '🇺🇸 🚺 Sky': 'af_sky',
    '🇺🇸 🚺 Alloy': 'af_alloy',
    '🇺🇸 🚺 Jessica': 'af_jessica',
    '🇺🇸 🚺 River': 'af_river',
    '🇺🇸 🚹 Michael': 'am_michael',
    '🇺🇸 🚹 Fenrir': 'am_fenrir',
    '🇺🇸 🚹 Puck': 'am_puck',
    '🇺🇸 🚹 Echo': 'am_echo',
    '🇺🇸 🚹 Eric': 'am_eric',
    '🇺🇸 🚹 Liam': 'am_liam',
    '🇺🇸 🚹 Onyx': 'am_onyx',
    '🇺🇸 🚹 Santa': 'am_santa',
    '🇺🇸 🚹 Adam': 'am_adam',
    '🇬🇧 🚺 Emma': 'bf_emma',
    '🇬🇧 🚺 Isabella': 'bf_isabella',
    '🇬🇧 🚺 Alice': 'bf_alice',
    '🇬🇧 🚺 Lily': 'bf_lily',
    '🇬🇧 🚹 George': 'bm_george',
    '🇬🇧 🚹 Fable': 'bm_fable',
    '🇬🇧 🚹 Lewis': 'bm_lewis',
    '🇬🇧 🚹 Daniel': 'bm_daniel',
}
# Resolve a local snapshot directory of the model repository via the
# Hugging Face Hub; the model files below are looked up inside it.
snapshot = snapshot_download(repo_id=KOKORO_REPO_ID)

# Text -> phonemes front end: Misaki's English G2P, with an espeak-based
# fallback object handed to it.
# NOTE(review): both objects are configured with british=False, yet the voice
# list also offers British (bf_*/bm_*) voices - confirm that is intended.
fallback = espeak.EspeakFallback(british=False)
g2p = en.G2P(trf=False, british=False, fallback=fallback)

# Kokoro ONNX engine, built from the model file and the voices bundle that
# live in the snapshot directory.
model_path, voices_path = (
    os.path.join(snapshot, file_name)
    for file_name in ("kokoro-v1.0.onnx", "voices-v1.0.bin")
)
kokoro = Kokoro(model_path, voices_path)
def predict(text, voice='af_heart', speed=1):
    """
    Generate speech audio from English text input.

    The text is converted to phonemes with the module-level Misaki G2P and
    the phonemes are then synthesized by the module-level Kokoro engine.
    Empty or whitespace-only text is rejected with a gr.Error.

    Parameters:
        text (string): The text to be converted into speech.
        voice (string): The selected male or female voice profile (specific voice ID, e.g. 'af_heart').
        speed (float): The speaking rate multiplier (e.g., 1.0 = normal speed, 0.8 = slower, 1.2 = faster).
    Returns:
        tuple: (sample_rate, samples) as produced by kokoro.create, in the order expected by the Audio output.
    """
    # Reject blank input up front so the caller gets a clear message instead
    # of whatever the G2P / ONNX pipeline does with an empty string.
    if not text or not text.strip():
        raise gr.Error("Input text is empty. Please enter some text to synthesize.")

    # The G2P returns a pair; only the phoneme string is needed here.
    phonemes, _ = g2p(text)

    # is_phonemes=True: the input is already phonemized, so Kokoro must not
    # run its own text-to-phoneme step on it.
    samples, sample_rate = kokoro.create(phonemes, voice, speed, is_phonemes=True)

    # kokoro.create yields (samples, rate); the return order is swapped.
    return sample_rate, samples
# Gradio interface around predict(): text box, voice dropdown and speed
# slider as inputs, a single audio player as output.
app = gr.Interface(
    predict,
    [
        gr.Textbox(label='Input Text'),
        # Choices are (label, voice id) pairs, so the default is given as a
        # voice id rather than as a label.
        gr.Dropdown(list(VOICES.items()), value='af_heart', label='Voice'),
        gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label='Speed'),
    ],
    gr.Audio(label='Output Audio', interactive=False, streaming=False, autoplay=True),
    description="Kokoro TTS 🇺🇸 🇬🇧 API Endpoint",
)

# Configure the queue BEFORE launching: launch(debug=True) blocks the main
# thread, so a queue() call placed after it would only run once the server
# has already stopped.
app.queue()
app.launch(share=False, debug=True, show_error=True, mcp_server=True)