broadfield-dev committed on
Commit
78e02b1
·
verified ·
1 Parent(s): 88a0625

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +81 -34
app.py CHANGED
@@ -1,43 +1,90 @@
1
  import gradio as gr
 
2
  from kittentts import KittenTTS
3
 
4
# Load the KittenTTS model once, at import time.
# (Author's note: this model is lightweight and runs on the CPU.)
m = KittenTTS("KittenML/kitten-tts-nano-0.1")


def text_to_speech(text):
    """Synthesize speech for ``text`` with the module-level KittenTTS model.

    Args:
        text: The text to be converted to speech.

    Returns:
        A ``(sample_rate, audio)`` tuple. The rate is always 24000 and
        ``audio`` is whatever ``m.generate(text)`` returns (a NumPy array,
        according to the original documentation of this function).
    """
    # The rate is hard-coded rather than read from the model; the original
    # comment states that KittenTTS generates audio at 24000 Hz.
    return (24000, m.generate(text))
22
-
23
# Build the input/output components first, then wire them into the interface.
_text_box = gr.Textbox(
    lines=3,
    label="Text to Synthesize",
    placeholder="Enter your text here...",
)
_speech_player = gr.Audio(label="Synthesized Speech")

# One-field example rows: each inner list fills the single text input.
_sample_prompts = [
    ["This high quality TTS model works without a GPU"],
    ["Gradio is a great tool for creating machine learning demos."],
    ["The quick brown fox jumps over the lazy dog."],
]

iface = gr.Interface(
    fn=text_to_speech,
    inputs=_text_box,
    outputs=_speech_player,
    title="KittenTTS: Text-to-Speech",
    description="A simple Gradio app to demonstrate the capabilities of the KittenTTS model. KittenTTS is a lightweight, high-quality text-to-speech model that can run on a CPU. [2]",
    examples=_sample_prompts,
    allow_flagging="never",
)

# Launch the Gradio app
iface.launch()
 
1
  import gradio as gr
2
+ import numpy as np
3
  from kittentts import KittenTTS
4
 
5
# 1. Initialize the KittenTTS model.
# This will download the model from Hugging Face on the first run.
print("Loading KittenTTS model...")
try:
    tts_model = KittenTTS("KittenML/kitten-tts-nano-0.1")
except Exception as e:
    # Nothing below can work without a model, so stop here. SystemExit is
    # raised directly instead of calling exit(): that helper is injected by
    # the `site` module and may be absent, and with no argument it ends the
    # process with status 0, which reports a failed start-up as success.
    print(f"Error loading model: {e}")
    raise SystemExit(1) from e
else:
    print("Model loaded successfully.")
15
 
 
 
 
16
 
17
# 2. Get the list of available voices directly from the model instance.
AVAILABLE_VOICES = tts_model.available_voices

# Prefer "expr-voice-5-m"; otherwise fall back to the first voice on offer.
# next(..., None) keeps start-up from crashing with IndexError when the model
# reports no voices at all, in which case DEFAULT_VOICE is None.
DEFAULT_VOICE = (
    "expr-voice-5-m"
    if "expr-voice-5-m" in AVAILABLE_VOICES
    else next(iter(AVAILABLE_VOICES), None)
)
20
 
21
+ # 3. Define the core function that Gradio will call.
22
+ # This function now accepts 'voice' and 'speed' as arguments.
23
def synthesize_speech(text, voice, speed):
    """Generate audio for ``text`` using the selected voice and speed.

    Args:
        text: Text to synthesize. ``None``, an empty string, or
            whitespace-only text produces an empty clip instead of a
            model call.
        voice: Voice identifier, forwarded to ``tts_model.generate`` as
            the ``voice`` keyword.
        speed: Speed value, forwarded to ``tts_model.generate`` as the
            ``speed`` keyword.

    Returns:
        A ``(sample_rate, audio)`` tuple, the format the Gradio audio
        output is given. ``sample_rate`` is always 24000. ``audio`` is a
        zero-length int16 NumPy array for blank input, otherwise whatever
        ``tts_model.generate`` returns.
    """
    # Single definition of the rate so both return paths cannot drift apart.
    sample_rate = 24000

    # Check for None/"" before calling .strip(): a missing text value would
    # otherwise raise AttributeError instead of yielding a silent clip.
    if not text or not text.strip():
        return (sample_rate, np.zeros(0, dtype=np.int16))

    # Call the model's generate method with all the parameters.
    audio_data = tts_model.generate(text, voice=voice, speed=speed)
    return (sample_rate, audio_data)
37
+
38
+ # 4. Create the Gradio UI with the new controls.
39
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🐱 Enhanced KittenTTS UI
        A user-friendly interface for the KittenTTS text-to-speech model.
        Select a voice, adjust the speed, and type your text to generate audio.
        """
    )

    with gr.Row():
        # Wide column: the text to synthesize.
        with gr.Column(scale=3):
            text_input = gr.Textbox(
                lines=5,
                label="Input Text",
                placeholder="Type something here..."
            )

        # Narrow column: voice/speed controls and the trigger button.
        with gr.Column(scale=1):
            voice_dropdown = gr.Dropdown(
                choices=AVAILABLE_VOICES,
                value=DEFAULT_VOICE,
                label="Voice Selection"
            )
            speed_slider = gr.Slider(
                minimum=0.5,
                maximum=2.0,
                step=0.1,
                value=1.0,
                label="Speech Speed"
            )
            generate_button = gr.Button("Generate Audio", variant="primary")

    audio_output = gr.Audio(label="Generated Speech", autoplay=True)

    # Candidate example rows: [text, voice, speed], in the same order as the
    # `inputs` list passed to gr.Examples below.
    example_rows = [
        ["This is an example of a female voice.", "expr-voice-5-f", 1.0],
        ["This is an example of a male voice, speaking a bit faster.", "expr-voice-5-m", 1.2],
        ["The speed can also be slowed down for clarity.", "expr-voice-4-f", 0.8],
    ]
    # The voice names above are hard-coded, so keep only the rows whose voice
    # the loaded model actually reports. This mirrors the membership check
    # used for DEFAULT_VOICE and ensures picking an example never puts a
    # value into the dropdown that is not one of its choices.
    usable_examples = [row for row in example_rows if row[1] in AVAILABLE_VOICES]
    if usable_examples:
        gr.Examples(
            examples=usable_examples,
            inputs=[text_input, voice_dropdown, speed_slider]
        )

    # Connect the UI components to the function
    generate_button.click(
        fn=synthesize_speech,
        inputs=[text_input, voice_dropdown, speed_slider],
        outputs=audio_output
    )

# Launch the Gradio app
demo.launch()