Athspi committed on
Commit
68d8b0d
·
verified ·
1 Parent(s): ab5a1ff

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +116 -46
app.py CHANGED
@@ -1,53 +1,123 @@
1
  import gradio as gr
 
2
  import time
3
- import wave
4
- from google import genai
5
- from google.genai import types
6
- from google.colab import userdata
7
-
8
- # Set up the wave file to save the output:
9
- def wave_file(filename, pcm, channels=1, rate=24000, sample_width=2):
10
- with wave.open(filename, "wb") as wf:
11
- wf.setnchannels(channels)
12
- wf.setsampwidth(sample_width)
13
- wf.setframerate(rate)
14
- wf.writeframes(pcm)
15
-
16
- # Retrieve the API key from Colab's Secrets Manage
17
- GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
18
- client = genai.Client(api_key=GOOGLE_API_KEY)
19
-
20
- def synthesize_speech(text):
21
- """Synthesizes speech from the given text and saves it to a wave file."""
22
- response = client.models.generate_content(
23
- model="gemini-2.5-flash-preview-tts",
24
- contents=f"Say cheerfully: {text}",
25
- config=types.GenerateContentConfig(
26
- response_modalities=["AUDIO"],
27
- speech_config=types.SpeechConfig(
28
- voice_config=types.VoiceConfig(
29
- prebuilt_voice_config=types.PrebuiltVoiceConfig(
30
- voice_name='Kore',
31
- )
32
- )
33
- ),
34
- )
35
- )
36
 
37
- data = response.candidates[0].content.parts[0].inline_data.data
38
-
39
- # Create a dynamic filename using a timestamp
 
 
 
 
 
40
  timestamp = int(time.time())
41
- file_name = f'out_{timestamp}.wav'
42
- wave_file(file_name, data)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
- return file_name
 
 
 
45
 
46
- iface = gr.Interface(
47
- fn=synthesize_speech,
48
- inputs=gr.Textbox(label="Enter text for speech synthesis"),
49
- outputs=gr.Audio(label="Generated Audio"),
50
- title="Text-to-Speech Interface"
51
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
- iface.launch()
 
 
 
 
1
  import gradio as gr
2
+ import google.generativeai as genai
3
  import time
4
+ import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
+ # --- Helper Function ---
7
def create_unique_wav_file(audio_data):
    """Write *audio_data* to a new file under ``audio_outputs`` and return its path.

    The file is named ``speech_output_<unix-seconds>.wav``. If that name is
    already taken (for example, two requests arriving within the same
    second), a numeric suffix ``_1``, ``_2``, ... is appended until an unused
    name is found, so an earlier output is never overwritten.

    Args:
        audio_data: Bytes-like object written to the file verbatim.

    Returns:
        The relative path of the file that was created.

    Raises:
        gr.Error: If the output directory or the file cannot be written.
    """
    output_dir = "audio_outputs"
    timestamp = int(time.time())

    try:
        # Inside the try so a directory failure is reported in the UI the
        # same way as a file-write failure.
        os.makedirs(output_dir, exist_ok=True)

        collision = 0
        while True:
            tag = f"_{collision}" if collision else ""
            file_name = os.path.join(
                output_dir, f"speech_output_{timestamp}{tag}.wav"
            )
            try:
                # "x" = exclusive creation: fails instead of truncating when
                # the path already exists, which makes the name claim atomic.
                handle = open(file_name, "xb")
            except FileExistsError:
                collision += 1
                continue

            # NOTE(review): the bytes are written as-is, with no WAV header
            # added here. Confirm the caller supplies a complete WAV
            # container rather than raw PCM samples.
            with handle:
                handle.write(audio_data)
            return file_name
    except Exception as e:
        print(f"Error saving wave file: {e}")
        raise gr.Error(f"Could not save audio file. Error: {e}") from e
25
+
26
+
27
+ # --- Core API Logic ---
28
def synthesize_speech(api_key, text):
    """Synthesize speech for *text* through the Gemini client and save it.

    Args:
        api_key: Google AI API key supplied by the user; surrounding
            whitespace is ignored.
        text: Text to speak; must contain at least one non-whitespace
            character.

    Returns:
        Path of the audio file written by ``create_unique_wav_file``.

    Raises:
        gr.Error: If an input is missing, the API call fails, the response
            carries no audio, or the audio file cannot be saved.
    """
    # 1. Validate inputs before making any API call.
    if not api_key or not api_key.strip():
        raise gr.Error("API Key is required. Please enter your Google AI API Key.")
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    try:
        # 2. Register the key with the genai module for the calls below.
        # NOTE(review): configure() appears to set module-wide state, so
        # concurrent users with different keys may interfere - confirm.
        genai.configure(api_key=api_key.strip())

        # 3. Request audio; the tone is steered through the prompt wording.
        # NOTE(review): the 'tts-1' model name, the response_mime_type
        # keyword and the audio_content attribute could not be confirmed
        # against the google.generativeai API from this file. Verify them
        # against the SDK reference; if they are wrong, this call always
        # ends in the generic failure below.
        model = genai.GenerativeModel(model_name='tts-1')
        prompt = f"Speak the following text in a cheerful and friendly voice: '{text}'"
        response = model.generate_content(prompt, response_mime_type="audio/wav")

        # 4. Persist the returned audio, or report that none was produced.
        if not response.audio_content:
            raise gr.Error("The API did not return audio data. Please check your text or try again.")
        return create_unique_wav_file(response.audio_content)

    except gr.Error:
        # Already a user-facing message (no audio / save failure); re-raise
        # untouched so it is not re-labelled as an API-key or network issue.
        raise
    except Exception as e:
        print(f"An error occurred: {e}")
        raise gr.Error(
            f"Failed to synthesize speech. Please check your API key and network connection. Error: {e}"
        ) from e
68
 
69
+ # --- Gradio User Interface ---
70
# Sample sentences offered below the form as one-click inputs.
_EXAMPLE_PROMPTS = [
    "The weather is wonderful today, perfect for a walk in the park.",
    "I am so excited to try out this new text-to-speech feature!",
    "Congratulations on your amazing achievement!",
    "This is a demonstration of high-quality speech synthesis.",
]

with gr.Blocks(theme=gr.themes.Soft()) as iface:
    # Page heading and short usage instructions.
    gr.Markdown(
        """
        # ✨ Gemini Text-to-Speech Synthesizer
        Enter your Google AI API Key and the text you want to convert to speech.
        The audio will be generated with a cheerful tone.
        """
    )

    # Key and text side by side; the text box is given twice the width.
    with gr.Row():
        # type="password" masks the key while it is typed.
        api_key_input = gr.Textbox(
            label="Google AI API Key",
            type="password",
            placeholder="Enter your API key here...",
            scale=1,
        )
        text_input = gr.Textbox(
            label="Text to Synthesize",
            placeholder="Hello! Welcome to the text-to-speech demonstration.",
            lines=3,
            scale=2,
        )

    submit_btn = gr.Button("Generate Speech", variant="primary")

    # Receives the file path returned by synthesize_speech.
    audio_output = gr.Audio(label="Generated Audio", type="filepath")

    # Clicking the button runs the synthesis with both inputs.
    submit_btn.click(
        fn=synthesize_speech,
        inputs=[api_key_input, text_input],
        outputs=audio_output,
    )

    # Examples fill only the text box; the API key is still required.
    gr.Examples(
        examples=_EXAMPLE_PROMPTS,
        inputs=[text_input],
        label="Example Prompts",
    )

# --- Main execution block ---
# Save this script as app.py and start it with `python app.py`.
if __name__ == "__main__":
    iface.launch()