Spaces:

DroolingPanda
/

teachingAssistant

Running

Michael Hu

switch to use kokoro

9c8546d 4 months ago

2.42 kB

	import functools
	import os
	import time

	import torch
	from phonemizer.backend.espeak.wrapper import EspeakWrapper
	from pydub import AudioSegment

	from models import build_model

	# Hugging Face Spaces setup
	# MODEL_DIR is the directory the model checkpoint and voice files are read
	# from. The directory is created at import time if it does not exist; this
	# only creates the folder, it does not fetch any model files (their presence
	# is checked later by TTSEngine._verify_model_files).
	MODEL_DIR = "./kokoro"
	os.makedirs(MODEL_DIR, exist_ok=True)

	# Configure espeak-ng for Hugging Face environment
	# Points phonemizer's EspeakWrapper at a hard-coded shared-library path.
	# NOTE(review): presumably the Debian/Ubuntu x86_64 install location of
	# libespeak-ng; confirm before running on a different image or architecture.
	EspeakWrapper.set_library('/usr/lib/x86_64-linux-gnu/libespeak-ng.so.1')

	class TTSEngine:
	    """Text-to-speech engine backed by the Kokoro files under ``MODEL_DIR``.

	    Construction selects CUDA when available (CPU otherwise), checks that the
	    checkpoint and voice files exist, and loads both. ``generate_speech`` then
	    renders text to a WAV file and returns that file's path.
	    """

	    # File names, relative to MODEL_DIR, of the checkpoint and the voice pack.
	    MODEL_FILE = "kokoro-v0_19.pth"
	    VOICE_FILE = "voices/af_bella.pt"

	    # Longest text (in characters) handed to the synthesizer.
	    MAX_TEXT_CHARS = 500
	    # Marker appended to text that had to be cut.
	    TRUNCATION_MARKER = "[TRUNCATED]"

	    # Directory that generated WAV files are written to.
	    OUTPUT_DIR = "temp/outputs"
	    # Frame rate passed to pydub for the exported audio.
	    SAMPLE_RATE = 24000

	    def __init__(self):
	        """Select the device, verify the model files, and load model and voice.

	        Raises:
	            FileNotFoundError: If the checkpoint or the voice file is missing.
	        """
	        self.device = "cuda" if torch.cuda.is_available() else "cpu"
	        self._verify_model_files()
	        self.model = build_model(f"{MODEL_DIR}/{self.MODEL_FILE}", self.device)
	        # NOTE(review): torch.load unpickles the file, so only load voice
	        # packs that come from a source you trust.
	        self.voice = torch.load(
	            f"{MODEL_DIR}/{self.VOICE_FILE}",
	            map_location=self.device,
	        )

	    def _verify_model_files(self):
	        """Ensure the required model files exist.

	        Raises:
	            FileNotFoundError: Lists every missing file together with setup
	                instructions for the Hugging Face Space.
	        """
	        required_files = [
	            f"{MODEL_DIR}/{self.MODEL_FILE}",
	            f"{MODEL_DIR}/{self.VOICE_FILE}",
	        ]

	        missing = [f for f in required_files if not os.path.exists(f)]
	        if missing:
	            raise FileNotFoundError(
	                f"Missing model files: {missing}\n"
	                "Add this to your Hugging Face Space settings:\n"
	                "App setup -> Clone Kokoro repository: "
	                "git clone https://huggingface.co/hexgrad/Kokoro-82M ./kokoro"
	            )

	    @classmethod
	    def _truncate(cls, text: str) -> str:
	        """Return *text* limited to ``MAX_TEXT_CHARS`` characters.

	        Text within the limit is returned unchanged. Longer text is cut so
	        that the kept prefix plus ``TRUNCATION_MARKER`` is exactly
	        ``MAX_TEXT_CHARS`` long; the marker's own length is subtracted so the
	        result never exceeds the limit.
	        """
	        if len(text) <= cls.MAX_TEXT_CHARS:
	            return text
	        keep = cls.MAX_TEXT_CHARS - len(cls.TRUNCATION_MARKER)
	        return text[:keep] + cls.TRUNCATION_MARKER

	    @staticmethod
	    def _to_pcm16_bytes(audio) -> bytes:
	        """Return *audio* as raw sample bytes suitable for 16-bit PCM.

	        Accepts anything ``torch.as_tensor`` understands (tensor, NumPy array,
	        sequence). The data is moved to the CPU first because a CUDA tensor
	        cannot be converted with ``.numpy()``. Floating-point samples are
	        clamped to [-1, 1] and scaled to int16; data of any other dtype is
	        passed through unchanged.
	        """
	        samples = torch.as_tensor(audio).detach().cpu()
	        if samples.is_floating_point():
	            # NOTE(review): assumes float output is a waveform normalised to
	            # [-1, 1]; confirm against generate_full.
	            samples = (samples.clamp(-1.0, 1.0) * 32767.0).round().to(torch.int16)
	        return samples.contiguous().numpy().tobytes()

	    def generate_speech(self, text: str, language: str = "zh") -> str:
	        """Synthesize *text* to a WAV file and return the file's path.

	        Args:
	            text: Text to speak. Input longer than ``MAX_TEXT_CHARS`` is cut
	                and ends with ``TRUNCATION_MARKER``.
	            language: Accepted for interface compatibility but currently
	                unused; synthesis always runs with ``lang='en-us'``.
	                NOTE(review): the default ``"zh"`` suggests Chinese input was
	                intended; confirm which codes ``generate_full`` accepts
	                before wiring this through.

	        Returns:
	            Path of the WAV file written under ``OUTPUT_DIR``.
	        """
	        from kokoro import generate_full

	        # Safety check for Hugging Face Free Tier: bound the input length.
	        text = self._truncate(text)

	        audio, _ = generate_full(
	            self.model,
	            text,
	            self.voice,
	            lang='en-us',
	            max_len=200 if self.device == "cpu" else 500,
	        )

	        # Save output. The directory is created on demand because nothing
	        # else in this module creates it, and the nanosecond timestamp keeps
	        # two calls within the same second from overwriting each other.
	        os.makedirs(self.OUTPUT_DIR, exist_ok=True)
	        output_path = f"{self.OUTPUT_DIR}/output_{time.time_ns()}.wav"
	        AudioSegment(
	            self._to_pcm16_bytes(audio),
	            frame_rate=self.SAMPLE_RATE,
	            sample_width=2,  # bytes per sample, i.e. 16-bit PCM
	            channels=1,
	        ).export(output_path, format="wav")

	        return output_path

	# Initialize TTS engine once per process.
	# The cache comes from functools because this module never imports the `st`
	# name the previous decorator relied on, which made the module fail on import.
	@functools.lru_cache(maxsize=1)
	def get_tts_engine() -> "TTSEngine":
	    """Return the shared ``TTSEngine``, constructing it on the first call.

	    Later calls return the same cached instance, so the model and voice are
	    loaded only once.

	    Raises:
	        FileNotFoundError: Propagated from ``TTSEngine()`` when model files
	            are missing. A failed construction is not cached, so the next
	            call tries again.
	    """
	    return TTSEngine()

	def generate_speech(text: str, language: str = "zh") -> str:
	    """Module-level entry point for TTS generation.

	    Fetches the engine from ``get_tts_engine()`` and forwards both arguments
	    to its ``generate_speech`` method, returning whatever that call returns.
	    """
	    engine = get_tts_engine()
	    output_path = engine.generate_speech(text, language)
	    return output_path