import os
import time

import torch
import streamlit as st  # needed for st.cache_resource below (missing in the original)
from pydub import AudioSegment
from phonemizer.backend.espeak.wrapper import EspeakWrapper

from models import build_model

# Hugging Face Spaces setup
MODEL_DIR = "./kokoro"
os.makedirs(MODEL_DIR, exist_ok=True)

# Configure espeak-ng for the Hugging Face environment
EspeakWrapper.set_library('/usr/lib/x86_64-linux-gnu/libespeak-ng.so.1')


class TTSEngine:
    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self._verify_model_files()
        self.model = build_model(f"{MODEL_DIR}/kokoro-v0_19.pth", self.device)
        self.voice = torch.load(f"{MODEL_DIR}/voices/af_bella.pt", map_location=self.device)

    def _verify_model_files(self):
        """Ensure required model files exist."""
        required_files = [
            f"{MODEL_DIR}/kokoro-v0_19.pth",
            f"{MODEL_DIR}/voices/af_bella.pt",
        ]
        missing = [f for f in required_files if not os.path.exists(f)]
        if missing:
            raise FileNotFoundError(
                f"Missing model files: {missing}\n"
                "Add this to your Hugging Face Space settings:\n"
                "App setup -> Clone Kokoro repository: "
                "git clone https://huggingface.co/hexgrad/Kokoro-82M ./kokoro"
            )

    def generate_speech(self, text: str, language: str = "zh") -> str:
        """Generate speech from text and return the path to the generated WAV file."""
        from kokoro import generate_full

        # Safety check for the Hugging Face free tier: cap input length
        if len(text) > 500:
            text = text[:495] + "[TRUNCATED]"

        audio, _ = generate_full(
            self.model,
            text,
            self.voice,
            lang='en-us',  # NOTE: af_bella is an English voice; the `language` argument is not applied here
            max_len=200 if self.device == "cpu" else 500
        )

        # Convert to 16-bit PCM before wrapping in AudioSegment.
        # Assumption: the model returns float audio in [-1, 1]; raw float bytes
        # would not match sample_width=2.
        audio = torch.as_tensor(audio).cpu()
        pcm = (audio.clamp(-1, 1) * 32767).to(torch.int16).numpy()

        # Save output (make sure the output directory exists first)
        os.makedirs("temp/outputs", exist_ok=True)
        output_path = f"temp/outputs/output_{int(time.time())}.wav"
        AudioSegment(
            pcm.tobytes(),
            frame_rate=24000,
            sample_width=2,  # 16-bit samples
            channels=1
        ).export(output_path, format="wav")
        return output_path


# Initialize the TTS engine once and cache it across Streamlit reruns
@st.cache_resource
def get_tts_engine():
    return TTSEngine()


def generate_speech(text: str, language: str = "zh") -> str:
    """Public interface for TTS generation."""
    return get_tts_engine().generate_speech(text, language)
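

# --- Usage sketch (not part of the original module) ---
# A minimal Streamlit front end wired to generate_speech(). The widget labels
# and the module name "tts_module" are assumptions for illustration only.
#
# import streamlit as st
# from tts_module import generate_speech  # assuming this file is saved as tts_module.py
#
# st.title("Kokoro TTS Demo")
# text = st.text_area("Text to synthesize", max_chars=500)
# if st.button("Generate") and text.strip():
#     wav_path = generate_speech(text)
#     st.audio(wav_path, format="audio/wav")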