import os
import time
import uuid
import wave

import google.generativeai as genai
import gradio as gr
from google.generativeai.types import GenerationConfig

# --- API key (read from the environment) ---
# On Hugging Face Spaces, add a secret named "GOOGLE_API_KEY" in the Space's
# settings; its value is exposed to the app as an environment variable.
# The result is None when the variable is not set.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

# --- Helper Functions ---
def create_unique_wav_file(pcm_data, channels=1, rate=24000, sample_width=2):
    """Write raw PCM audio to a uniquely named WAV file and return its path.

    Args:
        pcm_data: Raw PCM frames (bytes-like) to write.
        channels: Channel count written to the WAV header.
        rate: Frame rate written to the WAV header.
        sample_width: Sample width in bytes written to the WAV header.

    Returns:
        Path of the file created inside the ``audio_outputs`` directory.

    Raises:
        gr.Error: If the output directory cannot be created or the file
            cannot be written.
    """
    output_dir = "audio_outputs"

    # A whole-second timestamp alone collides when two calls land in the same
    # second (the later call would overwrite the earlier one's audio), so a
    # random suffix is appended to keep every file name distinct.
    timestamp = int(time.time())
    unique_suffix = uuid.uuid4().hex
    file_name = os.path.join(
        output_dir, f"speech_output_{timestamp}_{unique_suffix}.wav"
    )

    try:
        # Created inside the try so a directory failure is reported to the UI
        # the same way as a write failure.
        os.makedirs(output_dir, exist_ok=True)
        with wave.open(file_name, "wb") as wf:
            wf.setnchannels(channels)
            wf.setsampwidth(sample_width)
            wf.setframerate(rate)
            wf.writeframes(pcm_data)
    except Exception as e:
        print(f"Error saving wave file: {e}")
        raise gr.Error(f"Could not save audio file. Error: {e}") from e
    return file_name

# --- Core API Logic (Corrected API Call Structure) ---
def synthesize_speech(text, voice):
    """Synthesize speech for *text* using the selected prebuilt Gemini voice.

    Args:
        text: Text to speak; must contain non-whitespace characters.
        voice: Name of the prebuilt voice to request.

    Returns:
        Path of the WAV file written by ``create_unique_wav_file``.

    Raises:
        gr.Error: If the API key, text, or voice is missing, if the response
            contains no audio data, or if the API call or file write fails.
    """
    # 1. Validate inputs (API key, text, voice) before doing any work.
    if not GOOGLE_API_KEY:
        raise gr.Error("Google API Key not found. Please ensure you have set the GOOGLE_API_KEY secret in your Hugging Face Space settings.")
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")
    if not voice:
        raise gr.Error("Please select a voice.")

    try:
        # 2. Configure the client with the API key.
        genai.configure(api_key=GOOGLE_API_KEY)

        # 3. Instantiate the TTS model.
        model = genai.GenerativeModel(model_name="gemini-2.5-flash-preview-tts")

        # 4. Build the generation config carrying the voice selection.
        # NOTE(review): confirm that the installed google-generativeai version
        # accepts `speech_config` here and `response_modalities` below.
        tts_generation_config = GenerationConfig(
            speech_config={
                "voice_config": {
                    "prebuilt_voice_config": {
                        "voice_name": voice
                    }
                }
            }
        )

        # 5. Request audio output for the prompt.
        prompt = f"Say cheerfully: {text}"
        response = model.generate_content(
            contents=prompt,
            generation_config=tts_generation_config,
            response_modalities=["AUDIO"],
        )

        # 6. Take the first part that actually carries inline audio bytes,
        # rather than assuming the first part always does.
        candidates = response.candidates
        parts = candidates[0].content.parts if candidates else []
        audio_data = None
        for part in parts:
            blob = getattr(part, "inline_data", None)
            data = getattr(blob, "data", None)
            if data:
                audio_data = data
                break

        if not audio_data:
            raise gr.Error("The API did not return audio data. Please check your text or try again.")

        return create_unique_wav_file(audio_data)

    except gr.Error:
        # Already a user-facing message (missing audio, file-save failure);
        # let it through instead of rewrapping it as a network/API-key error.
        raise
    except Exception as e:
        # Surface unexpected failures in the UI with the underlying error.
        print(f"An error occurred: {e}")
        raise gr.Error(f"Failed to synthesize speech. Please check your network connection and that your API key is valid. Error: {e}") from e

# --- Gradio User Interface ---
with gr.Blocks(theme=gr.themes.Soft()) as iface:
    # Page header.
    gr.Markdown(
        value="""
        # ✨ Gemini Text-to-Speech Synthesizer
        This app uses a Google AI API key stored securely in Hugging Face secrets. 
        Just enter the text, choose a voice, and generate speech!
        """
    )

    # Prebuilt voice names offered in the dropdown.
    voice_options = [
        "Zephyr",
        "Puck",
        "Charon",
        "Kore",
        "Fenrir",
        "Leda",
        "Orus",
        "Aoede",
        "Callirrhoe",
        "Autonoe",
        "Enceladus",
        "Iapetus",
        "Umbriel",
        "Algieba",
        "Despina",
        "Erinome",
        "Algenib",
        "Rasalgethi",
        "Laomedeia",
        "Achernar",
        "Alnilam",
        "Schedar",
        "Gacrux",
        "Pulcherrima",
        "Achird",
        "Zubenelgenubi",
        "Vindemiatrix",
        "Sadachbia",
        "Sadaltager",
        "Sulafat",
    ]

    # Input widgets, trigger button, and output player, in display order.
    text_input = gr.Textbox(
        lines=4,
        label="Text to Synthesize",
        placeholder="Hello! Welcome to the text-to-speech demonstration.",
    )
    voice_dropdown = gr.Dropdown(
        choices=voice_options,
        value="Kore",
        label="Choose a Voice",
    )
    submit_btn = gr.Button(value="Generate Speech", variant="primary")
    audio_output = gr.Audio(type="filepath", label="Generated Audio")

    # Clicking the button runs the synthesis and feeds the resulting file
    # path to the audio player.
    submit_btn.click(
        synthesize_speech,
        [text_input, voice_dropdown],
        audio_output,
    )

    # Clickable sample inputs that fill in the text box and voice dropdown.
    gr.Examples(
        label="Example Prompts & Voices",
        inputs=[text_input, voice_dropdown],
        examples=[
            ["The weather is wonderful today, perfect for a walk in the park.", "Puck"],
            ["This is a demonstration of high-quality speech synthesis.", "Charon"],
            ["By the pricking of my thumbs, something wicked this way comes.", "Enceladus"],
        ],
    )

# --- Script entry point ---
if __name__ == "__main__":
    iface.launch()