File size: 6,214 Bytes
1b567fa
5efbc82
 
9acb9c3
9a88d9c
 
9acb9c3
 
 
9a88d9c
 
 
 
9acb9c3
9a88d9c
 
 
 
 
9acb9c3
9a88d9c
 
 
9acb9c3
 
9a88d9c
9acb9c3
9a88d9c
 
 
 
5efbc82
 
 
9a88d9c
1b567fa
5efbc82
1b567fa
9acb9c3
9a88d9c
9acb9c3
 
9a88d9c
9acb9c3
9a88d9c
 
 
9acb9c3
5efbc82
9a88d9c
9acb9c3
 
5efbc82
1b567fa
9acb9c3
1b567fa
9acb9c3
 
 
 
 
 
 
 
9a88d9c
 
 
 
 
 
9acb9c3
 
1b567fa
9acb9c3
1b567fa
 
 
 
 
 
9acb9c3
 
 
 
9a88d9c
 
 
9acb9c3
9a88d9c
 
 
 
 
9acb9c3
 
 
 
 
9a88d9c
 
 
9acb9c3
 
9a88d9c
 
9acb9c3
9a88d9c
 
9acb9c3
 
9a88d9c
 
 
 
 
9acb9c3
 
 
 
 
 
 
9a88d9c
 
 
9acb9c3
9a88d9c
 
 
 
 
9acb9c3
9a88d9c
 
 
 
 
 
 
9acb9c3
 
9a88d9c
9acb9c3
9a88d9c
9acb9c3
9a88d9c
 
 
 
 
9acb9c3
 
 
 
 
 
 
 
9a88d9c
9acb9c3
 
 
 
 
 
 
 
 
 
9a88d9c
 
9acb9c3
9a88d9c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
# Import configuration first to setup environment
import app_config

from fastapi import FastAPI, HTTPException, Form
from fastapi.responses import FileResponse
from pydantic import BaseModel
from kokoro import KPipeline
import soundfile as sf
import torch
import os
import tempfile
import uuid
import logging
from typing import Optional

# Configure logging
# basicConfig sets the root logger to INFO; `logger` is this module's own
# named logger and is what the service and the endpoints below log through.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Application object that the route decorators below register handlers on.
app = FastAPI(title="Kokoro TTS API", description="Text-to-Speech API using Kokoro", version="1.0.0")

class TTSRequest(BaseModel):
    """JSON request body accepted by the ``/tts-json`` endpoint."""

    # Text to synthesize; the endpoint rejects it if it is blank after strip().
    text: str
    # Voice identifier; checked by the endpoint against the service's voice list.
    voice: str = "af_heart"
    # Language code forwarded to the service, which passes it to KPipeline.
    lang_code: str = "a"

class KokoroTTSService:
    """Wrap a Kokoro ``KPipeline`` and render text to WAV files on disk.

    One pipeline is kept on the instance (``self.pipeline``) and is rebuilt
    whenever a request asks for a different ``lang_code``.
    """

    # Sample rate (Hz) used when writing generated audio.
    SAMPLE_RATE = 24000

    # Voice identifiers reported by get_available_voices().
    _VOICES = (
        "af_heart", "af_bella", "af_nicole", "af_aoede", "af_kore",
        "af_sarah", "af_nova", "af_sky", "af_alloy", "af_jessica", "af_river",
        "am_michael", "am_fenrir", "am_puck", "am_echo", "am_eric",
        "am_liam", "am_onyx", "am_santa", "am_adam",
        "bf_emma", "bf_isabella", "bf_alice", "bf_lily",
        "bm_george", "bm_fable", "bm_lewis", "bm_daniel",
    )

    def __init__(self):
        """Record the compute device and load the default pipeline.

        Raises:
            Exception: whatever ``KPipeline`` raised while loading; it is
                logged and re-raised unchanged.
        """
        # Recorded for logging and for the /health endpoint; it is not passed
        # to KPipeline here.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Using device: {self.device}")

        if app_config.is_hf_spaces():
            logger.info("Running on Hugging Face Spaces")

        try:
            # Initialize Kokoro pipeline following the working example pattern
            logger.info("Initializing Kokoro TTS pipeline...")
            self.pipeline = KPipeline(lang_code='a')
            logger.info("Kokoro TTS pipeline loaded successfully")
        except Exception as e:
            logger.error(f"Failed to load Kokoro TTS pipeline: {e}")
            # Bare raise keeps the original traceback intact.
            raise

    def generate_speech(self, text: str, voice: str = "af_heart", lang_code: str = "a") -> str:
        """Synthesize ``text`` and return the path of the WAV file written.

        Every segment yielded by the pipeline is appended to the same file,
        so text that the pipeline splits into several segments is rendered
        in full rather than cut off after the first one.

        Args:
            text: Text to synthesize.
            voice: Voice identifier passed to the pipeline.
            lang_code: Language code; the pipeline is rebuilt if it differs
                from the current pipeline's ``lang_code``.

        Returns:
            Path of the generated file inside ``app_config.get_temp_dir()``.

        Raises:
            HTTPException: status 500 on any failure, including the case
                where the pipeline yields no audio at all.
        """
        output_path = None
        try:
            # Create a unique filename for the output
            output_filename = f"kokoro_output_{uuid.uuid4().hex}.wav"
            output_path = os.path.join(app_config.get_temp_dir(), output_filename)

            # Update pipeline language if different
            if self.pipeline.lang_code != lang_code:
                logger.info(f"Switching language from {self.pipeline.lang_code} to {lang_code}")
                self.pipeline = KPipeline(lang_code=lang_code)

            # Collect first so no file is created when nothing was generated.
            segments = []
            for i, (gs, ps, audio) in enumerate(self.pipeline(text, voice=voice)):
                logger.info(f"Generated audio segment {i}: gs={gs}, ps={ps}")
                if audio is not None:
                    segments.append(audio)

            if not segments:
                raise RuntimeError("No audio was generated for the given text")

            # Append each segment in order to a single mono WAV file.
            with sf.SoundFile(
                output_path, mode="w", samplerate=self.SAMPLE_RATE, channels=1
            ) as wav_file:
                for segment in segments:
                    wav_file.write(segment)

            return output_path
        except Exception as e:
            logger.error(f"Error generating speech: {e}")
            # Do not leave a partially written file behind.
            self._remove_quietly(output_path)
            raise HTTPException(
                status_code=500, detail=f"Failed to generate speech: {str(e)}"
            ) from e

    @staticmethod
    def _remove_quietly(path):
        """Best-effort delete of ``path``; a missing file or ``None`` is ignored."""
        if path is None:
            return
        try:
            os.remove(path)
        except OSError:
            # Cleanup must never mask the error that triggered it.
            pass

    def get_available_voices(self):
        """Return a new list of the supported voice identifiers."""
        # A fresh list per call so callers cannot mutate the shared tuple.
        return list(self._VOICES)

# Initialize Kokoro TTS service
# Created at import time: the constructor loads the Kokoro pipeline and
# re-raises on failure, so a load error aborts the module import.
tts_service = KokoroTTSService()

@app.get("/")
async def root():
    """Report that the API is up."""
    payload = {"message": "Kokoro TTS API is running", "status": "healthy"}
    return payload

@app.get("/health")
async def health_check():
    """Report service health together with the device the service recorded."""
    device_name = tts_service.device
    return dict(status="healthy", device=device_name)

@app.get("/voices")
async def get_voices():
    """List the voice identifiers the TTS service reports as available."""
    voice_names = tts_service.get_available_voices()
    return {"voices": voice_names}

@app.post("/tts")
async def text_to_speech(
    text: str = Form(...),
    voice: str = Form("af_heart"),
    lang_code: str = Form("a")
):
    """
    Convert text to speech using Kokoro TTS

    - **text**: The text to convert to speech
    - **voice**: Voice to use (default: "af_heart")
    - **lang_code**: Kokoro language code (default: "a", American English)
    """
    # Function-scope import keeps this handler self-contained.
    from fastapi import BackgroundTasks

    if not text.strip():
        raise HTTPException(status_code=400, detail="Text cannot be empty")

    # Validate voice
    available_voices = tts_service.get_available_voices()
    if voice not in available_voices:
        raise HTTPException(
            status_code=400,
            detail=f"Voice '{voice}' not available. Available voices: {available_voices}"
        )

    try:
        # Generate speech
        output_path = tts_service.generate_speech(text, voice, lang_code)
    except HTTPException:
        # The service already built a proper HTTP error; re-wrapping it would
        # turn its detail into "500: Failed to ...".
        raise
    except Exception as e:
        logger.error(f"Error in TTS endpoint: {e}")
        raise HTTPException(status_code=500, detail=str(e))

    # Delete the generated file once the response has been sent so the temp
    # directory does not grow with every request.
    cleanup = BackgroundTasks()
    cleanup.add_task(os.remove, output_path)

    # No explicit Content-Disposition header: FileResponse derives
    # 'attachment; filename="..."' from `filename`, and an explicit header
    # would take precedence and drop the filename.
    return FileResponse(
        output_path,
        media_type="audio/wav",
        filename=f"kokoro_tts_{voice}_{uuid.uuid4().hex}.wav",
        background=cleanup,
    )

@app.post("/tts-json")
async def text_to_speech_json(request: TTSRequest):
    """
    Convert text to speech using JSON request body

    - **request**: TTSRequest containing text, voice, and lang_code
    """
    # Function-scope import keeps this handler self-contained.
    from fastapi import BackgroundTasks

    if not request.text.strip():
        raise HTTPException(status_code=400, detail="Text cannot be empty")

    # Validate voice
    available_voices = tts_service.get_available_voices()
    if request.voice not in available_voices:
        raise HTTPException(
            status_code=400,
            detail=f"Voice '{request.voice}' not available. Available voices: {available_voices}"
        )

    try:
        # Generate speech
        output_path = tts_service.generate_speech(request.text, request.voice, request.lang_code)
    except HTTPException:
        # The service already built a proper HTTP error; re-wrapping it would
        # turn its detail into "500: Failed to ...".
        raise
    except Exception as e:
        logger.error(f"Error in TTS JSON endpoint: {e}")
        raise HTTPException(status_code=500, detail=str(e))

    # Delete the generated file once the response has been sent so the temp
    # directory does not grow with every request.
    cleanup = BackgroundTasks()
    cleanup.add_task(os.remove, output_path)

    # No explicit Content-Disposition header: FileResponse derives
    # 'attachment; filename="..."' from `filename`, and an explicit header
    # would take precedence and drop the filename.
    return FileResponse(
        output_path,
        media_type="audio/wav",
        filename=f"kokoro_tts_{request.voice}_{uuid.uuid4().hex}.wav",
        background=cleanup,
    )