File size: 2,419 Bytes
9c8546d
933cc7f
9c8546d
933cc7f
9c8546d
 
 
 
 
 
cd1309d
9c8546d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
933cc7f
9c8546d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
933cc7f
9c8546d
 
 
 
933cc7f
 
9c8546d
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import functools
import os
import time

import torch
from phonemizer.backend.espeak.wrapper import EspeakWrapper
from pydub import AudioSegment

from models import build_model

# Hugging Face Spaces setup: directory that holds the Kokoro checkpoint and
# voice packs. It is created up front if it does not exist yet.
MODEL_DIR = "./kokoro"
os.makedirs(MODEL_DIR, exist_ok=True)

# Configure espeak-ng for the Hugging Face environment by pointing phonemizer
# at the shared library's absolute path.
_ESPEAK_LIBRARY_PATH = '/usr/lib/x86_64-linux-gnu/libespeak-ng.so.1'
EspeakWrapper.set_library(_ESPEAK_LIBRARY_PATH)

class TTSEngine:
    """Text-to-speech engine wrapping one Kokoro model and one voice pack.

    Construction picks CUDA when available (CPU otherwise), verifies that the
    checkpoint and voice files exist under ``MODEL_DIR``, then loads both.
    """

    # Longest input (in characters) passed to the synthesizer.
    MAX_TEXT_CHARS = 500
    # Directory that receives generated WAV files; created on demand.
    OUTPUT_DIR = "temp/outputs"
    # Raw-audio layout declared to pydub when wrapping the samples.
    SAMPLE_RATE = 24000
    SAMPLE_WIDTH = 2  # bytes per sample, i.e. 16-bit PCM
    CHANNELS = 1

    def __init__(self):
        """Load the model and voice pack onto the selected device.

        Raises:
            FileNotFoundError: If a required model file is missing.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self._verify_model_files()
        self.model = build_model(f"{MODEL_DIR}/kokoro-v0_19.pth", self.device)
        # NOTE(security): torch.load unpickles the file, so the voice pack
        # must come from a trusted source.
        self.voice = torch.load(
            f"{MODEL_DIR}/voices/af_bella.pt",
            map_location=self.device,
        )

    def _verify_model_files(self):
        """Ensure required model files exist.

        Raises:
            FileNotFoundError: Listing every missing path together with
                setup instructions.
        """
        required_files = [
            f"{MODEL_DIR}/kokoro-v0_19.pth",
            f"{MODEL_DIR}/voices/af_bella.pt",
        ]

        missing = [f for f in required_files if not os.path.exists(f)]
        if missing:
            raise FileNotFoundError(
                f"Missing model files: {missing}\n"
                "Add this to your Hugging Face Space settings:\n"
                "App setup -> Clone Kokoro repository: "
                "git clone https://huggingface.co/hexgrad/Kokoro-82M ./kokoro"
            )

    @staticmethod
    def _to_pcm16_bytes(audio) -> bytes:
        """Return *audio* as raw signed 16-bit PCM bytes.

        Accepts anything ``torch.as_tensor`` understands. The data is moved
        to the CPU first, so tensors living on a CUDA device work too.
        Floating-point samples are clamped to [-1.0, 1.0] and scaled to the
        int16 range; integer samples are cast to int16 unchanged.
        """
        samples = torch.as_tensor(audio).detach().cpu()
        if samples.is_floating_point():
            # assumes float audio is normalized to [-1.0, 1.0] -- TODO confirm
            # against what generate_full actually returns
            samples = samples.float().clamp(-1.0, 1.0) * 32767
        return samples.to(torch.int16).contiguous().numpy().tobytes()

    def generate_speech(self, text: str, language: str = "zh") -> str:
        """Synthesize *text* and return the path of the WAV file written.

        Args:
            text: Text to speak. Input longer than ``MAX_TEXT_CHARS`` is cut
                to exactly that many characters.
            language: Accepted for interface compatibility but currently
                unused: ``generate_full`` is always called with
                ``lang='en-us'``.

        Returns:
            Path of the generated WAV file inside ``OUTPUT_DIR``.
        """
        from kokoro import generate_full

        # Safety check for Hugging Face Free Tier: cap the input length.
        # No marker is appended, because it would push the text back over
        # the limit and would itself be synthesized.
        if len(text) > self.MAX_TEXT_CHARS:
            text = text[:self.MAX_TEXT_CHARS]

        audio, _ = generate_full(
            self.model,
            text,
            self.voice,
            lang='en-us',
            max_len=200 if self.device == "cpu" else 500,
        )

        # Save output. The directory may not exist on a fresh checkout, and
        # a nanosecond timestamp keeps two requests within the same second
        # from overwriting each other.
        os.makedirs(self.OUTPUT_DIR, exist_ok=True)
        output_path = f"{self.OUTPUT_DIR}/output_{time.time_ns()}.wav"
        AudioSegment(
            self._to_pcm16_bytes(audio),
            frame_rate=self.SAMPLE_RATE,
            sample_width=self.SAMPLE_WIDTH,
            channels=self.CHANNELS,
        ).export(output_path, format="wav")

        return output_path

# Initialize TTS engine once. The cache comes from the standard library:
# the previous decorator referenced `st`, a name this module never imports,
# so importing the module raised NameError.
@functools.lru_cache(maxsize=1)
def get_tts_engine():
    """Return the shared TTSEngine, constructing it on the first call.

    Later calls return the same instance. If construction raises (for
    example because model files are missing), nothing is cached and the
    next call tries again.
    """
    return TTSEngine()

def generate_speech(text: str, language: str = "zh") -> str:
    """Module-level entry point for TTS generation.

    Fetches the engine from get_tts_engine() and forwards both arguments to
    its generate_speech method, returning whatever that method returns.
    """
    engine = get_tts_engine()
    return engine.generate_speech(text, language)