File size: 5,257 Bytes
ab5a1ff
68d8b0d
059047d
ab5a1ff
68d8b0d
ee8b748
ab5a1ff
b4357ba
ee8b748
b4357ba
 
 
ee8b748
 
 
68d8b0d
 
 
ab5a1ff
68d8b0d
 
 
ee8b748
 
 
 
 
68d8b0d
 
 
 
 
e4ca1d6
ee8b748
68d8b0d
ee8b748
68d8b0d
b4357ba
 
 
68d8b0d
 
ee8b748
 
68d8b0d
 
059047d
 
68d8b0d
059047d
 
 
e4ca1d6
 
059047d
 
 
 
 
 
 
 
68d8b0d
e4ca1d6
059047d
 
ee8b748
e4ca1d6
 
ee8b748
68d8b0d
059047d
ee8b748
 
 
68d8b0d
 
 
ab5a1ff
68d8b0d
 
 
b4357ba
ab5a1ff
68d8b0d
 
 
 
 
ee8b748
 
68d8b0d
 
 
ee8b748
 
 
 
 
 
 
 
 
 
b4357ba
 
 
 
 
68d8b0d
ee8b748
 
 
 
68d8b0d
 
 
 
ee8b748
68d8b0d
 
ee8b748
68d8b0d
 
 
 
 
ee8b748
 
 
68d8b0d
ee8b748
 
68d8b0d
ab5a1ff
68d8b0d
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import os
import time
import uuid
import wave

import google.generativeai as genai
import gradio as gr
from google.generativeai.types import GenerationConfig

# --- Load API Key from Hugging Face Secrets ---
# On Hugging Face Spaces, add a secret named "GOOGLE_API_KEY" in the Space
# settings, with your Google AI API key as its value. It is read here from the
# process environment; the result is None when the variable is not set.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

# --- Helper Functions ---
def create_unique_wav_file(pcm_data, channels=1, rate=24000, sample_width=2):
    """Save PCM audio data to a uniquely named WAV file and return its path.

    The file is written into the ``audio_outputs`` directory (relative to the
    current working directory), which is created if it does not exist.

    Args:
        pcm_data: Raw PCM frames (bytes-like) to write into the WAV container.
        channels: Number of audio channels written to the WAV header.
        rate: Frame rate in Hz written to the WAV header.
        sample_width: Sample width in bytes written to the WAV header.

    Returns:
        The path of the WAV file that was written.

    Raises:
        OSError: If the output directory cannot be created.
        gr.Error: If the WAV file cannot be opened or written.
    """
    output_dir = "audio_outputs"
    os.makedirs(output_dir, exist_ok=True)

    # A second-resolution timestamp alone is not unique: two requests handled
    # within the same second would share a path and overwrite each other.
    # The random UUID suffix makes every call produce a distinct file.
    timestamp = int(time.time())
    unique_suffix = uuid.uuid4().hex
    file_name = os.path.join(
        output_dir, f"speech_output_{timestamp}_{unique_suffix}.wav"
    )

    try:
        with wave.open(file_name, "wb") as wf:
            wf.setnchannels(channels)
            wf.setsampwidth(sample_width)
            wf.setframerate(rate)
            wf.writeframes(pcm_data)
    except Exception as e:
        print(f"Error saving wave file: {e}")
        # Chain the cause so the original traceback stays visible in logs.
        raise gr.Error(f"Could not save audio file. Error: {e}") from e
    return file_name

# --- Core API Logic (Corrected API Call Structure) ---
def synthesize_speech(text, voice):
    """Synthesize speech for ``text`` with the Gemini TTS model.

    The text is sent to the ``gemini-2.5-flash-preview-tts`` model with the
    selected prebuilt voice, and the audio bytes from the first part of the
    first candidate are written to a WAV file.

    Args:
        text: The text to speak; must contain a non-whitespace character.
        voice: Name of the prebuilt voice to request.

    Returns:
        The path of the WAV file returned by ``create_unique_wav_file``.

    Raises:
        gr.Error: If the API key, text or voice is missing, if the response
            carries no audio data, or if the API call or file write fails.
    """
    # 1. Validate inputs (API key, text and voice) before touching the API.
    if not GOOGLE_API_KEY:
        raise gr.Error("Google API Key not found. Please ensure you have set the GOOGLE_API_KEY secret in your Hugging Face Space settings.")
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")
    if not voice:
        raise gr.Error("Please select a voice.")

    try:
        # 2. Configure the client with the API key.
        genai.configure(api_key=GOOGLE_API_KEY)

        # 3. Instantiate the TTS model.
        model = genai.GenerativeModel(model_name="gemini-2.5-flash-preview-tts")

        # 4. Build the generation config carrying only the speech settings.
        tts_generation_config = GenerationConfig(
            speech_config={
                "voice_config": {
                    "prebuilt_voice_config": {
                        "voice_name": voice
                    }
                }
            }
        )

        # 5. Request audio. The user text is wrapped in a fixed style instruction.
        prompt = f"Say cheerfully: {text}"
        # NOTE(review): confirm that the installed google-generativeai version
        # accepts `response_modalities` as a keyword of generate_content and
        # `speech_config` on GenerationConfig; otherwise this call raises and
        # is reported through the generic failure message below.
        response = model.generate_content(
            contents=prompt,
            generation_config=tts_generation_config,
            response_modalities=["AUDIO"],
        )

        # 6. Pull the audio bytes out of the first part of the first candidate.
        candidates = response.candidates
        parts = candidates[0].content.parts if candidates else None
        audio_data = parts[0].inline_data.data if parts else None
        if not audio_data:
            # An empty payload would only produce a silent, zero-length file.
            raise gr.Error("The API did not return audio data. Please check your text or try again.")

        # Assumes the payload is raw PCM matching the helper's default WAV
        # parameters (mono, 24 kHz, 2-byte samples) -- TODO confirm.
        return create_unique_wav_file(audio_data)

    except gr.Error:
        # Already a user-facing message (no audio / file-save failure); do not
        # bury it under the generic network/API-key hint below.
        raise
    except Exception as e:
        # Provide a more informative error message in the UI.
        print(f"An error occurred: {e}")
        raise gr.Error(f"Failed to synthesize speech. Please check your network connection and that your API key is valid. Error: {e}") from e

# --- Gradio User Interface ---
with gr.Blocks(theme=gr.themes.Soft()) as iface:
    # Page header and short usage instructions.
    gr.Markdown(
        value="""
        # ✨ Gemini Text-to-Speech Synthesizer
        This app uses a Google AI API key stored securely in Hugging Face secrets. 
        Just enter the text, choose a voice, and generate speech!
        """
    )

    # Voice names offered in the dropdown (listed from the API documentation,
    # per the original author).
    voice_options = [
        "Zephyr", "Puck", "Charon", "Kore", "Fenrir",
        "Leda", "Orus", "Aoede", "Callirrhoe", "Autonoe",
        "Enceladus", "Iapetus", "Umbriel", "Algieba", "Despina",
        "Erinome", "Algenib", "Rasalgethi", "Laomedeia", "Achernar",
        "Alnilam", "Schedar", "Gacrux", "Pulcherrima", "Achird",
        "Zubenelgenubi", "Vindemiatrix", "Sadachbia", "Sadaltager", "Sulafat",
    ]

    # Input widgets, the trigger button and the audio player, created in the
    # order they appear on the page.
    text_input = gr.Textbox(
        lines=4,
        label="Text to Synthesize",
        placeholder="Hello! Welcome to the text-to-speech demonstration.",
    )
    voice_dropdown = gr.Dropdown(
        choices=voice_options,
        value="Kore",
        label="Choose a Voice",
    )
    submit_btn = gr.Button(value="Generate Speech", variant="primary")
    audio_output = gr.Audio(type="filepath", label="Generated Audio")

    # Clicking the button runs the synthesis and feeds the returned file path
    # to the audio player.
    submit_btn.click(
        synthesize_speech,
        [text_input, voice_dropdown],
        audio_output,
    )

    # Clickable sample rows that pre-fill the text box and voice dropdown.
    example_rows = [
        ["The weather is wonderful today, perfect for a walk in the park.", "Puck"],
        ["This is a demonstration of high-quality speech synthesis.", "Charon"],
        ["By the pricking of my thumbs, something wicked this way comes.", "Enceladus"],
    ]
    gr.Examples(
        examples=example_rows,
        inputs=[text_input, voice_dropdown],
        label="Example Prompts & Voices",
    )

# --- Main execution block ---
# Launch the Gradio app only when this file is executed as a script, so that
# importing the module does not start the app.
if __name__ == "__main__":
    iface.launch()