File size: 5,493 Bytes
ed74e8e
618f849
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c732fbe
618f849
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c732fbe
618f849
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ed74e8e
 
 
618f849
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
# Copied from https://huggingface.co/spaces/KingNish/Kitten-TTS and modified to handle large text input.
import gradio as gr
import tempfile
import uuid
import os
import re
import numpy as np
import soundfile as sf
from kittentts import KittenTTS
from tqdm.auto import tqdm
# Initialize the TTS model.
# Created once at import time as a module-level instance; generate_speech()
# and get_available_voices() below both use this same object.
model = KittenTTS("KittenML/kitten-tts-nano-0.1")

def _split_oversized_sentence(sentence, chunk_size):
    """Break a sentence longer than `chunk_size` into pieces that each fit."""
    pieces = []
    for word in sentence.split():
        if len(word) <= chunk_size:
            pieces.append(word)
        else:
            # A single unbroken token is still too long: slice it hard so the
            # size limit is always honoured.
            pieces.extend(
                word[start:start + chunk_size]
                for start in range(0, len(word), chunk_size)
            )
    return pieces


def split_text_into_chunks(text, chunk_size=400):
    """
    Split long text into smaller chunks of max length `chunk_size`.

    Sentences (text ending in '.', '!' or '?' followed by spaces) are packed
    greedily into chunks, joined by a single space. A sentence that is longer
    than `chunk_size` on its own is broken at whitespace, and a single token
    longer than `chunk_size` is sliced, so no returned chunk exceeds the limit.

    Args:
        text: The text to split. Empty or whitespace-only text yields [].
        chunk_size: Maximum number of characters per chunk (must be >= 1).

    Returns:
        A list of non-empty, stripped chunks in their original order.

    Raises:
        ValueError: If `chunk_size` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")
    if not text:
        return []

    # Split by punctuation followed by space (preserves sentence boundaries)
    sentences = re.split(r'(?<=[.!?]) +', text)

    chunks = []
    current_chunk = ""

    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue

        if len(sentence) <= chunk_size:
            pieces = [sentence]
        else:
            pieces = _split_oversized_sentence(sentence, chunk_size)

        for piece in pieces:
            # "+ 1" accounts for the space that would join the two parts.
            if current_chunk and len(current_chunk) + 1 + len(piece) > chunk_size:
                chunks.append(current_chunk)
                current_chunk = piece
            elif current_chunk:
                current_chunk = f"{current_chunk} {piece}"
            else:
                current_chunk = piece

    if current_chunk:
        chunks.append(current_chunk)

    return chunks

def generate_speech(text, voice, speed):
    """
    Generate speech from long text in a memory-efficient way.

    The text is split into chunks; each chunk is synthesized separately and
    written straight to one WAV file, so the chunks are not all kept in
    memory at once.

    Args:
        text: Text to synthesize.
        voice: Voice name forwarded to `model.generate`.
        speed: Speed value forwarded to `model.generate`.

    Returns:
        Path of the generated WAV file inside `./saved_audio`.

    Raises:
        gr.Error: If `text` is empty/whitespace-only, or if synthesis or
            writing the file fails. Every event binding in this file has a
            single audio output, so an error message cannot be returned as a
            second value; raising lets Gradio display it to the user.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter some text to generate speech.")

    # Shared output directory (update this path to your shared disk)
    shared_dir = "./saved_audio"
    unique_filename = f"kitten_tts_{uuid.uuid4()}.wav"
    output_path = os.path.join(shared_dir, unique_filename)

    try:
        # Break text into manageable chunks
        chunks = split_text_into_chunks(text, chunk_size=400)

        os.makedirs(shared_dir, exist_ok=True)

        # Open the WAV file for writing
        with sf.SoundFile(output_path, mode='w', samplerate=24000, channels=1, subtype='PCM_16') as f:
            for chunk in tqdm(chunks, desc="Streaming audio to disk", unit="chunk"):
                # NOTE(review): the " ...." suffix presumably pads the end of
                # each chunk so the last word is not cut off — confirm.
                audio = model.generate(chunk + " ....", voice=voice, speed=speed)
                f.write(audio)  # Write audio directly to disk
    except Exception as e:
        # Do not leave a truncated WAV behind when generation fails midway.
        try:
            os.remove(output_path)
        except OSError:
            pass
        raise gr.Error(f"Error during TTS generation: {e}") from e

    return output_path

def get_available_voices():
    """
    Get list of available voices from the model.

    Returns:
        The value of `model.available_voices` when it is non-empty; otherwise
        the single-entry fallback list ["expr-voice-5-m"]. The fallback is
        also used when reading the voices raises an ordinary exception.
    """
    fallback = ["expr-voice-5-m"]
    try:
        voices = model.available_voices
    except Exception:
        # Best-effort lookup: fall back instead of failing at import time, but
        # let KeyboardInterrupt/SystemExit propagate (a bare except would not).
        return fallback
    return voices if voices else fallback

# Get voices once on load.
# The resulting list is reused by the UI definition below for the dropdown
# choices, the dropdown's default value, and the example rows.
available_voices = get_available_voices()

# Create Gradio UI
# Layout: a title, then one row with the inputs (text, voice, speed, button)
# in a wider left column and the audio player in a narrower right column,
# followed by example inputs, a collapsible model-info panel, and the event
# bindings that connect the inputs to generate_speech().
with gr.Blocks(title="KittenTTS - Text to Speech", theme=gr.themes.Soft()) as app:
    gr.Markdown("# 🐱 KittenTTS - Text to Speech Generator")
    gr.Markdown("Convert your text to high-quality speech using the KittenTTS nano model!")

    with gr.Row():
        # Left column (scale=2): all user inputs.
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Text to Convert",
                placeholder="Enter the text you want to convert to speech...",
                lines=4,
                max_lines=10
            )
            
            with gr.Row():
                # Defaults to the first entry of the voice list loaded above.
                voice_dropdown = gr.Dropdown(
                    choices=available_voices,
                    value=available_voices[0],
                    label="Voice Selection",
                    info="Choose the voice for speech generation"
                )
                
                speed_slider = gr.Slider(
                    minimum=0.5,
                    maximum=2.0,
                    step=0.01,
                    value=1,
                    label="Speech Speed",
                    info="Adjust the speed of speech (0.5x to 2.0x)"
                )
            
            generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
        
        # Right column (scale=1): read-only player for the generated file.
        with gr.Column(scale=1):
            # type="filepath" matches generate_speech(), which returns a path.
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath",
                interactive=False,
                autoplay=True
            )

    gr.Markdown("## 📝 Example Texts")
    # Each example row is [text, voice, speed], in the same order as `inputs`.
    gr.Examples(
        examples=[
            ["Hello! This is a test of the KittenTTS model.", available_voices[0], 1],
            ["The quick brown fox jumps over the lazy dog.", available_voices[0], 1.25],
            ["Welcome to the world of high-quality text-to-speech synthesis!", available_voices[0], 1.5],
        ],
        inputs=[text_input, voice_dropdown, speed_slider],
        outputs=[audio_output],
        fn=generate_speech,
        label="Click on an example to try it out",
        # Example caching is currently disabled.
        # NOTE(review): with caching off, clicking an example presumably only
        # fills the inputs without running `fn` — confirm for the Gradio
        # version in use.
        # cache_examples="lazy"
    )

    with gr.Accordion("ℹ️ Model Information", open=False):
        gr.Markdown("""
        **Model:** `KittenML/kitten-tts-nano-0.1`  
        **Features:**  
        - High-quality text-to-speech synthesis  
        - Works without GPU acceleration  
        - Multiple voice options  
        - Adjustable speech speed  
        - 24kHz audio output  

        **Usage Instructions:**  
        1. Enter your text  
        2. Select a voice  
        3. Adjust the speech speed if needed  
        4. Click "Generate Speech"  
        """)

    # Event Bindings
    # Both the button click and submitting the textbox run the same handler
    # with the same inputs and the single audio output.
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, voice_dropdown, speed_slider],
        outputs=[audio_output]
    )

    text_input.submit(
        fn=generate_speech,
        inputs=[text_input, voice_dropdown, speed_slider],
        outputs=[audio_output]
    )

# Run the app
# Only start the server when this file is executed as a script (not when it
# is imported); queue() is called on the app before launch().
if __name__ == "__main__":
    app.queue().launch()