import gradio as gr
import google.generativeai as genai
from google.generativeai.types import GenerationConfig
import time
import os
import wave
# --- Load API Key from Hugging Face Secrets ---
# For this to work on Hugging Face Spaces, you must go to your Space's
# settings and add a secret named "GOOGLE_API_KEY" with your Google AI API key as the value.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
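# For local development (outside of Spaces), you can instead export the variable in
# your shell before running the app, e.g. `export GOOGLE_API_KEY="your-key-here"`.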
# --- Helper Functions ---
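# Note: the defaults below (mono, 24 kHz, 16-bit samples) are chosen to match the raw
# PCM that the Gemini TTS preview models return; adjust them if the API output differs.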
def create_unique_wav_file(pcm_data, channels=1, rate=24000, sample_width=2):
"""Saves PCM audio data to a uniquely named WAV file and returns the path."""
output_dir = "audio_outputs"
os.makedirs(output_dir, exist_ok=True)
timestamp = int(time.time())
file_name = os.path.join(output_dir, f'speech_output_{timestamp}.wav')
try:
with wave.open(file_name, "wb") as wf:
wf.setnchannels(channels)
wf.setsampwidth(sample_width)
wf.setframerate(rate)
wf.writeframes(pcm_data)
return file_name
except Exception as e:
print(f"Error saving wave file: {e}")
raise gr.Error(f"Could not save audio file. Error: {e}")
# --- Core API Logic (Corrected API Call Structure) ---
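# NOTE: "gemini-2.5-flash-preview-tts" is a preview model name and may change; if the
# request fails with a model-not-found error, check Google's docs for the current TTS model id.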
def synthesize_speech(text, voice):
"""
Synthesizes speech from text using the Gemini API's native TTS capabilities.
"""
# 1. Validate Inputs (API Key and Text)
if not GOOGLE_API_KEY:
raise gr.Error("Google API Key not found. Please ensure you have set the GOOGLE_API_KEY secret in your Hugging Face Space settings.")
if not text or not text.strip():
raise gr.Error("Please enter some text to synthesize.")
if not voice:
raise gr.Error("Please select a voice.")
try:
# 2. Configure the API key once
genai.configure(api_key=GOOGLE_API_KEY)
# 3. Instantiate the correct model
model = genai.GenerativeModel(model_name="gemini-2.5-flash-preview-tts")
# 4. Construct the GenerationConfig with ONLY the speech_config
tts_generation_config = GenerationConfig(
speech_config={
"voice_config": {
"prebuilt_voice_config": {
"voice_name": voice
}
}
}
)
# 5. Generate content, passing response_modalities directly
prompt = f"Say cheerfully: {text}"
response = model.generate_content(
contents=prompt,
generation_config=tts_generation_config,
response_modalities=["AUDIO"] # CORRECTED: This is a direct argument
)
# 6. Extract audio data from the response structure
if response.candidates and response.candidates[0].content.parts:
audio_data = response.candidates[0].content.parts[0].inline_data.data
audio_file_path = create_unique_wav_file(audio_data)
return audio_file_path
else:
raise gr.Error("The API did not return audio data. Please check your text or try again.")
except Exception as e:
# Provide a more informative error message in the UI.
print(f"An error occurred: {e}")
raise gr.Error(f"Failed to synthesize speech. Please check your network connection and that your API key is valid. Error: {e}")
# --- Gradio User Interface ---
with gr.Blocks(theme=gr.themes.Soft()) as iface:
    gr.Markdown(
        """
        # ✨ Gemini Text-to-Speech Synthesizer
        This app uses a Google AI API key stored securely in Hugging Face secrets.
        Just enter the text, choose a voice, and generate speech!
        """
    )

    # List of available voices from the documentation
    voice_options = [
        "Zephyr", "Puck", "Charon", "Kore", "Fenrir", "Leda", "Orus", "Aoede",
        "Callirrhoe", "Autonoe", "Enceladus", "Iapetus", "Umbriel", "Algieba",
        "Despina", "Erinome", "Algenib", "Rasalgethi", "Laomedeia", "Achernar",
        "Alnilam", "Schedar", "Gacrux", "Pulcherrima", "Achird", "Zubenelgenubi",
        "Vindemiatrix", "Sadachbia", "Sadaltager", "Sulafat"
    ]

    # UI Components
    text_input = gr.Textbox(
        label="Text to Synthesize",
        placeholder="Hello! Welcome to the text-to-speech demonstration.",
        lines=4,
    )
    voice_dropdown = gr.Dropdown(
        voice_options, label="Choose a Voice", value="Kore"
    )
    submit_btn = gr.Button("Generate Speech", variant="primary")
    audio_output = gr.Audio(label="Generated Audio", type="filepath")

    # Connect the button click event to the core function
    submit_btn.click(
        fn=synthesize_speech,
        inputs=[text_input, voice_dropdown],
        outputs=audio_output
    )

    gr.Examples(
        examples=[
            ["The weather is wonderful today, perfect for a walk in the park.", "Puck"],
            ["This is a demonstration of high-quality speech synthesis.", "Charon"],
            ["By the pricking of my thumbs, something wicked this way comes.", "Enceladus"],
        ],
        inputs=[text_input, voice_dropdown],
        label="Example Prompts & Voices"
    )
# --- Main execution block ---
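# On Hugging Face Spaces, launch() with no arguments is sufficient; for local testing you
# could pass e.g. iface.launch(share=True) to get a temporary public URL.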
if __name__ == "__main__":
    iface.launch()