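# Gsgsgsg / app.py
# NOTE: the remainder of this header is residue from the web file viewer this
# file was copied from; it is not part of the program.
# Athspi's picture | Update app.py | b4357ba verified | raw | history blame | 4.65 kB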
import os
import time
import uuid

import google.generativeai as genai
import gradio as gr
# --- API key configuration ---
# The key is read from the GOOGLE_API_KEY environment variable. On Hugging Face
# Spaces, add a secret named "GOOGLE_API_KEY" (value: your Google AI API key) in
# the Space settings so that it is exposed to the app through the environment.
# The value is None when the variable is not set.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
# --- Helper Function ---
def create_unique_wav_file(audio_data):
    """Write audio bytes to a new, uniquely named file and return its path.

    Args:
        audio_data: Bytes written verbatim to disk. They are not inspected or
            re-encoded here, so the caller must supply a complete WAV payload.

    Returns:
        The path of the written file, inside the ``audio_outputs`` directory
        (relative to the current working directory).

    Raises:
        gr.Error: If the output directory or the file cannot be created or
            written.
    """
    output_dir = "audio_outputs"

    # A one-second timestamp alone is not unique: two requests finishing in
    # the same second would share a name and the later one would overwrite
    # the earlier one's audio. The random suffix makes every name distinct,
    # while the timestamp keeps the files roughly sortable by creation time.
    timestamp = int(time.time())
    file_name = os.path.join(
        output_dir, f"speech_output_{timestamp}_{uuid.uuid4().hex}.wav"
    )

    try:
        os.makedirs(output_dir, exist_ok=True)
        # "xb" = exclusive binary create: never silently clobber an existing file.
        with open(file_name, "xb") as wav_file:
            wav_file.write(audio_data)
    except Exception as e:
        print(f"Error saving wave file: {e}")
        raise gr.Error(f"Could not save audio file. Error: {e}") from e
    return file_name
# --- Core API Logic ---
def synthesize_speech(text):
    """Synthesize speech for ``text`` and return the path of the saved audio.

    The module-level ``GOOGLE_API_KEY`` is used to configure the
    ``google.generativeai`` client on every call.

    Args:
        text: The text to speak. It is embedded in a prompt asking for a
            cheerful and friendly voice.

    Returns:
        The file path returned by ``create_unique_wav_file`` for the audio
        bytes taken from the API response.

    Raises:
        gr.Error: If the API key is missing, ``text`` is empty or only
            whitespace, the response carries no audio, the audio cannot be
            saved, or the API call fails for any other reason.
    """
    # 1. Validate inputs (API key and text) before touching the network.
    if not GOOGLE_API_KEY:
        raise gr.Error("Google API Key not found. Please ensure you have set the GOOGLE_API_KEY secret in your Hugging Face Space settings.")
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    try:
        # 2. Configure the client with the loaded key.
        genai.configure(api_key=GOOGLE_API_KEY)

        # 3. Call the model.
        # NOTE(review): 'tts-1' does not look like a Gemini model id, and the
        # response object of google.generativeai is not known to expose an
        # `audio_content` attribute -- confirm both against the current Gemini
        # speech-generation documentation before relying on this path.
        model = genai.GenerativeModel(model_name='tts-1')
        prompt = f"Speak the following text in a cheerful and friendly voice: '{text}'"
        response = model.generate_content(prompt)

        # 4. Extract the audio and save it. getattr() keeps a response without
        # the attribute on the "no audio" path instead of surfacing an
        # AttributeError message to the user.
        audio_content = getattr(response, "audio_content", None)
        if not audio_content:
            raise gr.Error("The API did not return audio data. Please check your text or try again.")
        return create_unique_wav_file(audio_content)
    except gr.Error:
        # Already a precise, user-facing message (no audio returned, or the
        # file could not be saved). Re-wrapping it below would wrongly blame
        # the network connection or the API key.
        raise
    except Exception as e:
        # Anything else (network, authentication, SDK errors) is reported
        # with the generic hint; the original exception is kept as the cause.
        print(f"An error occurred: {e}")
        raise gr.Error(f"Failed to synthesize speech. Please check your network connection and that your API key is valid. Error: {e}") from e
# --- Gradio User Interface ---
with gr.Blocks(theme=gr.themes.Soft()) as iface:
    # Page header shown above the controls.
    gr.Markdown(
        "\n"
        "# ✨ Gemini Text-to-Speech Synthesizer\n"
        "This app uses an API key stored securely in Hugging Face secrets.\n"
        "Just enter the text you want to convert to speech!\n"
    )

    # Multi-line box for the text the user wants spoken.
    text_input = gr.Textbox(
        lines=4,
        placeholder="Hello! Welcome to the text-to-speech demonstration.",
        label="Text to Synthesize",
    )

    # Trigger button, followed by the player for the resulting audio file.
    submit_btn = gr.Button("Generate Speech", variant="primary")
    audio_output = gr.Audio(type="filepath", label="Generated Audio")

    # Clicking the button sends the textbox content to synthesize_speech and
    # routes the returned file path to the audio player. The API key is read
    # inside that function, so it is not an input here.
    submit_btn.click(fn=synthesize_speech, inputs=[text_input], outputs=audio_output)

    # Ready-made sentences users can load into the textbox.
    example_prompts = [
        "The weather is wonderful today, perfect for a walk in the park.",
        "I am so excited to try out this new text-to-speech feature!",
        "Congratulations on your amazing achievement!",
        "This is a demonstration of high-quality speech synthesis.",
    ]
    gr.Examples(examples=example_prompts, inputs=[text_input], label="Example Prompts")

# --- Entry point ---
# To deploy: push this file together with a requirements.txt to a Hugging Face
# Space and set GOOGLE_API_KEY in the repository secrets.
if __name__ == "__main__":
    iface.launch()