Update app.py
Browse files
app.py
CHANGED
@@ -1,67 +1,76 @@
|
|
1 |
import gradio as gr
|
2 |
import google.generativeai as genai
|
|
|
3 |
import time
|
4 |
import os
|
|
|
5 |
|
6 |
# --- Load API Key from Hugging Face Secrets ---
|
7 |
-
#
|
8 |
# settings and add a secret named "GOOGLE_API_KEY" with your Google AI API key as the value.
|
9 |
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
|
10 |
|
11 |
-
# --- Helper
|
12 |
-
def create_unique_wav_file(
|
13 |
-
"""Saves audio data to a uniquely named WAV file and returns the path."""
|
14 |
-
# Create a directory to store audio outputs if it doesn't exist
|
15 |
output_dir = "audio_outputs"
|
16 |
os.makedirs(output_dir, exist_ok=True)
|
17 |
|
18 |
-
# Generate a unique filename using a timestamp
|
19 |
timestamp = int(time.time())
|
20 |
file_name = os.path.join(output_dir, f'speech_output_{timestamp}.wav')
|
21 |
|
22 |
-
# The API returns a complete WAV file, so we just write the bytes directly.
|
23 |
try:
|
24 |
-
with open(file_name,
|
25 |
-
|
|
|
|
|
|
|
26 |
return file_name
|
27 |
except Exception as e:
|
28 |
print(f"Error saving wave file: {e}")
|
29 |
raise gr.Error(f"Could not save audio file. Error: {e}")
|
30 |
|
31 |
-
|
32 |
-
|
33 |
-
def synthesize_speech(text):
|
34 |
"""
|
35 |
-
Synthesizes speech from text using the Gemini API.
|
36 |
-
This function uses the API key loaded from Hugging Face secrets.
|
37 |
"""
|
38 |
# 1. Validate Inputs (API Key and Text)
|
39 |
if not GOOGLE_API_KEY:
|
40 |
raise gr.Error("Google API Key not found. Please ensure you have set the GOOGLE_API_KEY secret in your Hugging Face Space settings.")
|
41 |
if not text or not text.strip():
|
42 |
raise gr.Error("Please enter some text to synthesize.")
|
|
|
|
|
43 |
|
44 |
try:
|
45 |
-
# 2. Configure the Gemini
|
46 |
-
genai.
|
47 |
-
|
48 |
-
# 3. Call the Text-to-Speech Model
|
49 |
-
# We use the 'tts-1' model which is optimized for this task.
|
50 |
-
model = genai.GenerativeModel(model_name='tts-1')
|
51 |
|
52 |
-
#
|
53 |
-
prompt = f"
|
54 |
|
55 |
-
|
56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
|
58 |
-
# 4.
|
59 |
-
|
60 |
-
|
61 |
-
audio_file_path = create_unique_wav_file(
|
62 |
return audio_file_path
|
63 |
else:
|
64 |
-
# Handle cases where audio might not be generated
|
65 |
raise gr.Error("The API did not return audio data. Please check your text or try again.")
|
66 |
|
67 |
except Exception as e:
|
@@ -74,46 +83,52 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
|
|
74 |
gr.Markdown(
|
75 |
"""
|
76 |
# ✨ Gemini Text-to-Speech Synthesizer
|
77 |
-
This app uses
|
78 |
-
Just enter the text
|
79 |
"""
|
80 |
)
|
81 |
|
82 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
83 |
text_input = gr.Textbox(
|
84 |
label="Text to Synthesize",
|
85 |
placeholder="Hello! Welcome to the text-to-speech demonstration.",
|
86 |
lines=4,
|
87 |
)
|
88 |
|
89 |
-
|
|
|
|
|
|
|
90 |
submit_btn = gr.Button("Generate Speech", variant="primary")
|
91 |
|
92 |
-
# Component to display the generated audio.
|
93 |
audio_output = gr.Audio(label="Generated Audio", type="filepath")
|
94 |
|
95 |
-
# Connect the button click event to the core function
|
96 |
-
# The API key is now handled internally and not needed as an input.
|
97 |
submit_btn.click(
|
98 |
fn=synthesize_speech,
|
99 |
-
inputs=[text_input],
|
100 |
outputs=audio_output
|
101 |
)
|
102 |
|
103 |
-
# Provide example text for users to try easily.
|
104 |
gr.Examples(
|
105 |
examples=[
|
106 |
-
"The weather is wonderful today, perfect for a walk in the park.",
|
107 |
-
"
|
108 |
-
"
|
109 |
-
"This is a demonstration of high-quality speech synthesis."
|
110 |
],
|
111 |
-
inputs=[text_input],
|
112 |
-
label="Example Prompts"
|
113 |
)
|
114 |
|
115 |
# --- Main execution block ---
|
116 |
-
# To deploy, push this file and a requirements.txt to a Hugging Face Space
|
117 |
-
# and set the GOOGLE_API_KEY in the repository secrets.
|
118 |
if __name__ == "__main__":
|
119 |
iface.launch()
|
|
|
1 |
import os
import time
import wave

import gradio as gr
# NOTE(review): the calls used below (genai.Client, types.GenerateContentConfig,
# types.SpeechConfig, types.VoiceConfig, types.PrebuiltVoiceConfig) belong to the
# new Google Gen AI SDK ("google-genai"), NOT the legacy "google-generativeai"
# package, which has no `Client` class. Importing from the correct package is
# required for the TTS code to resolve at runtime.
from google import genai
from google.genai import types

# --- Load API Key from Hugging Face Secrets ---
# For this to work on Hugging Face Spaces, you must go to your Space's
# settings and add a secret named "GOOGLE_API_KEY" with your Google AI API key as the value.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
|
12 |
|
13 |
+
# --- Helper Functions ---
def create_unique_wav_file(pcm_data, channels=1, rate=24000, sample_width=2):
    """Save raw PCM audio frames to a uniquely named WAV file.

    Args:
        pcm_data: Raw PCM frame bytes to write (as returned by the TTS API).
        channels: Number of audio channels. Defaults to 1 (mono).
        rate: Sample rate in Hz. Defaults to 24000.
        sample_width: Bytes per sample. Defaults to 2 (16-bit).

    Returns:
        The filesystem path of the written ``.wav`` file.

    Raises:
        gr.Error: If the file cannot be written to disk.
    """
    output_dir = "audio_outputs"
    os.makedirs(output_dir, exist_ok=True)

    # A whole-second timestamp collides when called more than once per second
    # (concurrent users would overwrite each other's audio), so use a
    # nanosecond-resolution timestamp to make the name effectively unique.
    timestamp_ns = time.time_ns()
    file_name = os.path.join(output_dir, f'speech_output_{timestamp_ns}.wav')

    try:
        with wave.open(file_name, "wb") as wf:
            wf.setnchannels(channels)
            wf.setsampwidth(sample_width)
            wf.setframerate(rate)
            wf.writeframes(pcm_data)
        return file_name
    except Exception as e:
        print(f"Error saving wave file: {e}")
        raise gr.Error(f"Could not save audio file. Error: {e}")
|
32 |
|
33 |
+
# --- Core API Logic (Rewritten based on new documentation) ---
|
34 |
+
def synthesize_speech(text, voice):
|
|
|
35 |
"""
|
36 |
+
Synthesizes speech from text using the Gemini API's native TTS capabilities.
|
|
|
37 |
"""
|
38 |
# 1. Validate Inputs (API Key and Text)
|
39 |
if not GOOGLE_API_KEY:
|
40 |
raise gr.Error("Google API Key not found. Please ensure you have set the GOOGLE_API_KEY secret in your Hugging Face Space settings.")
|
41 |
if not text or not text.strip():
|
42 |
raise gr.Error("Please enter some text to synthesize.")
|
43 |
+
if not voice:
|
44 |
+
raise gr.Error("Please select a voice.")
|
45 |
|
46 |
try:
|
47 |
+
# 2. Configure the Gemini client directly
|
48 |
+
client = genai.Client(api_key=GOOGLE_API_KEY)
|
|
|
|
|
|
|
|
|
49 |
|
50 |
+
# 3. Construct the API call as per the new TTS documentation
|
51 |
+
prompt = f"Say cheerfully: {text}"
|
52 |
|
53 |
+
response = client.models.generate_content(
|
54 |
+
model="gemini-2.5-flash-preview-tts",
|
55 |
+
contents=prompt,
|
56 |
+
config=types.GenerateContentConfig(
|
57 |
+
response_modalities=["AUDIO"],
|
58 |
+
speech_config=types.SpeechConfig(
|
59 |
+
voice_config=types.VoiceConfig(
|
60 |
+
prebuilt_voice_config=types.PrebuiltVoiceConfig(
|
61 |
+
voice_name=voice,
|
62 |
+
)
|
63 |
+
)
|
64 |
+
),
|
65 |
+
)
|
66 |
+
)
|
67 |
|
68 |
+
# 4. Extract audio data from the new response structure
|
69 |
+
if response.candidates and response.candidates[0].content.parts:
|
70 |
+
audio_data = response.candidates[0].content.parts[0].inline_data.data
|
71 |
+
audio_file_path = create_unique_wav_file(audio_data)
|
72 |
return audio_file_path
|
73 |
else:
|
|
|
74 |
raise gr.Error("The API did not return audio data. Please check your text or try again.")
|
75 |
|
76 |
except Exception as e:
|
|
|
83 |
gr.Markdown(
|
84 |
"""
|
85 |
# ✨ Gemini Text-to-Speech Synthesizer
|
86 |
+
This app uses a Google AI API key stored securely in Hugging Face secrets.
|
87 |
+
Just enter the text, choose a voice, and generate speech!
|
88 |
"""
|
89 |
)
|
90 |
|
91 |
+
# List of available voices from the documentation
|
92 |
+
voice_options = [
|
93 |
+
"Zephyr", "Puck", "Charon", "Kore", "Fenrir", "Leda", "Orus", "Aoede",
|
94 |
+
"Callirrhoe", "Autonoe", "Enceladus", "Iapetus", "Umbriel", "Algieba",
|
95 |
+
"Despina", "Erinome", "Algenib", "Rasalgethi", "Laomedeia", "Achernar",
|
96 |
+
"Alnilam", "Schedar", "Gacrux", "Pulcherrima", "Achird", "Zubenelgenubi",
|
97 |
+
"Vindemiatrix", "Sadachbia", "Sadaltager", "Sulafat"
|
98 |
+
]
|
99 |
+
|
100 |
+
# UI Components
|
101 |
text_input = gr.Textbox(
|
102 |
label="Text to Synthesize",
|
103 |
placeholder="Hello! Welcome to the text-to-speech demonstration.",
|
104 |
lines=4,
|
105 |
)
|
106 |
|
107 |
+
voice_dropdown = gr.Dropdown(
|
108 |
+
voice_options, label="Choose a Voice", value="Kore"
|
109 |
+
)
|
110 |
+
|
111 |
submit_btn = gr.Button("Generate Speech", variant="primary")
|
112 |
|
|
|
113 |
audio_output = gr.Audio(label="Generated Audio", type="filepath")
|
114 |
|
115 |
+
# Connect the button click event to the core function
|
|
|
116 |
submit_btn.click(
|
117 |
fn=synthesize_speech,
|
118 |
+
inputs=[text_input, voice_dropdown],
|
119 |
outputs=audio_output
|
120 |
)
|
121 |
|
|
|
122 |
gr.Examples(
|
123 |
examples=[
|
124 |
+
["The weather is wonderful today, perfect for a walk in the park.", "Puck"],
|
125 |
+
["This is a demonstration of high-quality speech synthesis.", "Charon"],
|
126 |
+
["By the pricking of my thumbs, something wicked this way comes.", "Enceladus"],
|
|
|
127 |
],
|
128 |
+
inputs=[text_input, voice_dropdown],
|
129 |
+
label="Example Prompts & Voices"
|
130 |
)
|
131 |
|
132 |
# --- Main execution block ---
def _run() -> None:
    """Launch the Gradio interface when this file is executed as a script."""
    iface.launch()


if __name__ == "__main__":
    _run()
|