import os
import time
import uuid
import wave

import google.generativeai as genai
import gradio as gr
from google.generativeai.types import GenerationConfig

# --- Load API Key from Hugging Face Secrets ---
# For this to work on Hugging Face Spaces, you must go to your Space's
# settings and add a secret named "GOOGLE_API_KEY" with your Google AI API
# key as the value.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")


# --- Helper Functions ---
def create_unique_wav_file(pcm_data, channels=1, rate=24000, sample_width=2):
    """Save PCM audio data to a uniquely named WAV file and return its path.

    Args:
        pcm_data: Raw PCM frames to write into the WAV container.
        channels: Number of audio channels written to the WAV header.
        rate: Frame rate in Hz written to the WAV header.
        sample_width: Sample width in bytes written to the WAV header.

    Returns:
        Path of the written file inside the ``audio_outputs`` directory.

    Raises:
        gr.Error: If the output directory or the WAV file cannot be written.
    """
    output_dir = "audio_outputs"
    timestamp = int(time.time())
    # A second-resolution timestamp alone collides when two requests arrive
    # within the same second (the later one overwrites the earlier file), so
    # a random suffix is appended to keep every output distinct.
    file_name = os.path.join(
        output_dir, f"speech_output_{timestamp}_{uuid.uuid4().hex}.wav"
    )

    try:
        os.makedirs(output_dir, exist_ok=True)
        with wave.open(file_name, "wb") as wf:
            wf.setnchannels(channels)
            wf.setsampwidth(sample_width)
            wf.setframerate(rate)
            wf.writeframes(pcm_data)
    except Exception as e:
        # UI boundary: surface any failure as a Gradio error message.
        print(f"Error saving wave file: {e}")
        raise gr.Error(f"Could not save audio file. Error: {e}") from e
    return file_name


# --- Core API Logic ---
def synthesize_speech(text, voice):
    """Synthesize speech for ``text`` with the selected Gemini TTS voice.

    Args:
        text: Text to speak; must contain at least one non-whitespace
            character.
        voice: Name of the prebuilt voice passed to the API.

    Returns:
        Path to the generated WAV file.

    Raises:
        gr.Error: If the API key is missing, an input is empty, the API
            returns no audio, the file cannot be saved, or the API call
            fails for any other reason.
    """
    # 1. Validate inputs (API key, text and voice).
    if not GOOGLE_API_KEY:
        raise gr.Error(
            "Google API Key not found. Please ensure you have set the "
            "GOOGLE_API_KEY secret in your Hugging Face Space settings."
        )
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")
    if not voice:
        raise gr.Error("Please select a voice.")

    try:
        # 2. Configure the API key.
        genai.configure(api_key=GOOGLE_API_KEY)

        # 3. Instantiate the TTS model.
        model = genai.GenerativeModel(model_name="gemini-2.5-flash-preview-tts")

        # 4. Build the generation config carrying only the speech settings.
        # NOTE(review): confirm that the installed google-generativeai
        # version accepts `speech_config` here and `response_modalities`
        # on generate_content() below; if it does not, every request ends
        # up in the generic failure handler at the bottom of this function.
        tts_generation_config = GenerationConfig(
            speech_config={
                "voice_config": {
                    "prebuilt_voice_config": {
                        "voice_name": voice,
                    }
                }
            }
        )

        # 5. Generate content, requesting audio output.
        prompt = f"Say cheerfully: {text}"
        response = model.generate_content(
            contents=prompt,
            generation_config=tts_generation_config,
            response_modalities=["AUDIO"],
        )

        # 6. Extract the audio payload from the response structure.
        if not (response.candidates and response.candidates[0].content.parts):
            raise gr.Error(
                "The API did not return audio data. "
                "Please check your text or try again."
            )
        audio_data = response.candidates[0].content.parts[0].inline_data.data
        if not audio_data:
            # Guard against writing a WAV file that contains no frames.
            raise gr.Error(
                "The API did not return audio data. "
                "Please check your text or try again."
            )
        return create_unique_wav_file(audio_data)
    except gr.Error:
        # Already a user-facing message (no audio / save failure); do not
        # re-wrap it in the generic network/API-key hint below.
        raise
    except Exception as e:
        # Provide a more informative error message in the UI.
        print(f"An error occurred: {e}")
        raise gr.Error(
            "Failed to synthesize speech. Please check your network "
            f"connection and that your API key is valid. Error: {e}"
        ) from e


# --- Gradio User Interface ---
with gr.Blocks(theme=gr.themes.Soft()) as iface:
    gr.Markdown(
        """
        # ✨ Gemini Text-to-Speech Synthesizer
        This app uses a Google AI API key stored securely in Hugging Face secrets.
        Just enter the text, choose a voice, and generate speech!
        """
    )

    # List of available voices from the documentation
    voice_options = [
        "Zephyr", "Puck", "Charon", "Kore", "Fenrir", "Leda", "Orus", "Aoede",
        "Callirrhoe", "Autonoe", "Enceladus", "Iapetus", "Umbriel", "Algieba",
        "Despina", "Erinome", "Algenib", "Rasalgethi", "Laomedeia", "Achernar",
        "Alnilam", "Schedar", "Gacrux", "Pulcherrima", "Achird",
        "Zubenelgenubi", "Vindemiatrix", "Sadachbia", "Sadaltager", "Sulafat",
    ]

    # UI Components
    text_input = gr.Textbox(
        label="Text to Synthesize",
        placeholder="Hello! Welcome to the text-to-speech demonstration.",
        lines=4,
    )
    voice_dropdown = gr.Dropdown(
        voice_options,
        label="Choose a Voice",
        value="Kore",
    )
    submit_btn = gr.Button("Generate Speech", variant="primary")
    audio_output = gr.Audio(label="Generated Audio", type="filepath")

    # Connect the button click event to the core function
    submit_btn.click(
        fn=synthesize_speech,
        inputs=[text_input, voice_dropdown],
        outputs=audio_output,
    )

    gr.Examples(
        examples=[
            ["The weather is wonderful today, perfect for a walk in the park.", "Puck"],
            ["This is a demonstration of high-quality speech synthesis.", "Charon"],
            ["By the pricking of my thumbs, something wicked this way comes.", "Enceladus"],
        ],
        inputs=[text_input, voice_dropdown],
        label="Example Prompts & Voices",
    )

# --- Main execution block ---
if __name__ == "__main__":
    iface.launch()