# Source: Hugging Face file view (raw / history / blame), 2.42 kB.
# Author: Michael Hu -- commit 9c8546d, "switch to use kokoro".
import functools
import os
import time

import torch
from pydub import AudioSegment
from phonemizer.backend.espeak.wrapper import EspeakWrapper

from models import build_model
# --- Hugging Face Spaces environment setup ---
# Folder that holds the Kokoro checkpoint and voice packs. It is created at
# import time so the path exists even before the model files are added.
MODEL_DIR = "./kokoro"
os.makedirs(MODEL_DIR, exist_ok=True)

# Point the phonemizer's espeak wrapper at the espeak-ng shared library
# used in the Hugging Face environment.
_ESPEAK_NG_LIBRARY = '/usr/lib/x86_64-linux-gnu/libespeak-ng.so.1'
EspeakWrapper.set_library(_ESPEAK_NG_LIBRARY)
class TTSEngine:
    """Kokoro text-to-speech engine that renders text to WAV files.

    Construction loads the Kokoro checkpoint and the ``af_bella`` voice pack
    from ``MODEL_DIR`` onto CUDA when it is available, otherwise onto the CPU.
    """

    def __init__(self):
        """Pick a device, verify the model files, then load model and voice.

        Raises:
            FileNotFoundError: If the checkpoint or the voice pack is missing.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        # Fail early with setup instructions rather than an opaque load error.
        self._verify_model_files()
        self.model = build_model(f"{MODEL_DIR}/kokoro-v0_19.pth", self.device)
        # NOTE(review): torch.load unpickles the file; only load voice packs
        # that come from a trusted source.
        self.voice = torch.load(f"{MODEL_DIR}/voices/af_bella.pt",
                                map_location=self.device)

    def _verify_model_files(self):
        """Ensure the required model files exist.

        Raises:
            FileNotFoundError: Lists every missing file together with the
                command that clones the Kokoro repository into place.
        """
        required_files = [
            f"{MODEL_DIR}/kokoro-v0_19.pth",
            f"{MODEL_DIR}/voices/af_bella.pt",
        ]
        missing = [f for f in required_files if not os.path.exists(f)]
        if missing:
            raise FileNotFoundError(
                f"Missing model files: {missing}\n"
                "Add this to your Hugging Face Space settings:\n"
                "App setup -> Clone Kokoro repository: "
                "git clone https://huggingface.co/hexgrad/Kokoro-82M ./kokoro"
            )

    @staticmethod
    def _to_pcm16_bytes(audio) -> bytes:
        """Convert a waveform into raw signed 16-bit PCM bytes.

        Accepts anything ``torch.as_tensor`` understands (tensor on any
        device, array, list). Samples are flattened and emitted in native
        byte order.

        Raises:
            TypeError: If the samples are neither floating point nor int16.
        """
        samples = torch.as_tensor(audio).detach().cpu().flatten()
        if samples.is_floating_point():
            # Assumes float audio is normalised to [-1.0, 1.0] -- TODO confirm
            # against generate_full. Clamping prevents integer wrap-around.
            samples = (samples.float().clamp(-1.0, 1.0) * 32767.0).round()
            samples = samples.to(torch.int16)
        elif samples.dtype != torch.int16:
            raise TypeError(f"Unsupported audio dtype: {samples.dtype}")
        return samples.contiguous().numpy().tobytes()

    def generate_speech(self, text: str, language: str = "zh") -> str:
        """Synthesize ``text`` and return the path of the written WAV file.

        Args:
            text: Text to speak. Input longer than 500 characters is cut to
                its first 495 characters followed by ``"[TRUNCATED]"``.
            language: Accepted for interface compatibility but not forwarded;
                synthesis always passes ``lang='en-us'``.
                NOTE(review): the default ``"zh"`` suggests Chinese was
                intended -- confirm which codes ``generate_full`` accepts
                before wiring this parameter through.

        Returns:
            Path of the mono, 24000 Hz, 16-bit WAV file under
            ``temp/outputs``. The name is derived from the current time in
            whole seconds.
        """
        from kokoro import generate_full

        # Safety checks for Hugging Face Free Tier
        if len(text) > 500:
            text = text[:495] + "[TRUNCATED]"

        audio, _ = generate_full(
            self.model,
            text,
            self.voice,
            lang='en-us',
            max_len=200 if self.device == "cpu" else 500
        )

        # Save output. The directory is created on demand because nothing
        # else in this module creates it, and export() fails without it.
        output_dir = "temp/outputs"
        os.makedirs(output_dir, exist_ok=True)
        output_path = f"{output_dir}/output_{int(time.time())}.wav"

        # sample_width=2 means 16-bit PCM, so the samples must be converted
        # to int16 before their bytes are handed to AudioSegment.
        AudioSegment(
            self._to_pcm16_bytes(audio),
            frame_rate=24000,
            sample_width=2,
            channels=1
        ).export(output_path, format="wav")
        return output_path
# Initialize TTS engine once per process and reuse it afterwards.
# NOTE: this was previously decorated with `st.cache_resource`, but no `st`
# name is imported in this module, so importing the file raised NameError.
# `functools.lru_cache` gives the same build-once behaviour using only the
# standard library.
@functools.lru_cache(maxsize=1)
def get_tts_engine() -> "TTSEngine":
    """Return the shared ``TTSEngine``, constructing it on the first call.

    Later calls return the cached instance, so the model is loaded once.
    No lock guards construction: threads racing on the very first call may
    each build an engine before the cache is filled.

    Raises:
        FileNotFoundError: Propagated from ``TTSEngine()`` when the model
            files are missing.
    """
    return TTSEngine()
def generate_speech(text: str, language: str = "zh") -> str:
    """Module-level entry point for TTS generation.

    Fetches the engine from ``get_tts_engine()`` and forwards both arguments
    to its ``generate_speech`` method, returning that result unchanged.
    """
    engine = get_tts_engine()
    result = engine.generate_speech(text, language)
    return result