Athspi committed on
Commit ee8b748 · verified · 1 Parent(s): b4357ba

Update app.py

Files changed (1)
  1. app.py +61 -46
app.py CHANGED
@@ -1,67 +1,76 @@
 import gradio as gr
 import google.generativeai as genai
+from google.generativeai import types
 import time
 import os
+import wave
 
 # --- Load API Key from Hugging Face Secrets ---
-# IMPORTANT: For this to work on Hugging Face Spaces, you must go to your Space's
+# For this to work on Hugging Face Spaces, you must go to your Space's
 # settings and add a secret named "GOOGLE_API_KEY" with your Google AI API key as the value.
 GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
 
-# --- Helper Function ---
-def create_unique_wav_file(audio_data):
-    """Saves audio data to a uniquely named WAV file and returns the path."""
-    # Create a directory to store audio outputs if it doesn't exist
+# --- Helper Functions ---
+def create_unique_wav_file(pcm_data, channels=1, rate=24000, sample_width=2):
+    """Saves PCM audio data to a uniquely named WAV file and returns the path."""
     output_dir = "audio_outputs"
     os.makedirs(output_dir, exist_ok=True)
 
-    # Generate a unique filename using a timestamp
     timestamp = int(time.time())
     file_name = os.path.join(output_dir, f'speech_output_{timestamp}.wav')
 
-    # The API returns a complete WAV file, so we just write the bytes directly.
     try:
-        with open(file_name, 'wb') as f:
-            f.write(audio_data)
+        with wave.open(file_name, "wb") as wf:
+            wf.setnchannels(channels)
+            wf.setsampwidth(sample_width)
+            wf.setframerate(rate)
+            wf.writeframes(pcm_data)
         return file_name
     except Exception as e:
        print(f"Error saving wave file: {e}")
        raise gr.Error(f"Could not save audio file. Error: {e}")
 
-
-# --- Core API Logic ---
-def synthesize_speech(text):
+# --- Core API Logic (Rewritten based on new documentation) ---
+def synthesize_speech(text, voice):
     """
-    Synthesizes speech from text using the Gemini API.
-    This function uses the API key loaded from Hugging Face secrets.
+    Synthesizes speech from text using the Gemini API's native TTS capabilities.
     """
     # 1. Validate Inputs (API Key and Text)
     if not GOOGLE_API_KEY:
         raise gr.Error("Google API Key not found. Please ensure you have set the GOOGLE_API_KEY secret in your Hugging Face Space settings.")
     if not text or not text.strip():
         raise gr.Error("Please enter some text to synthesize.")
+    if not voice:
+        raise gr.Error("Please select a voice.")
 
     try:
-        # 2. Configure the Gemini API with the loaded key
-        genai.configure(api_key=GOOGLE_API_KEY)
-
-        # 3. Call the Text-to-Speech Model
-        # We use the 'tts-1' model which is optimized for this task.
-        model = genai.GenerativeModel(model_name='tts-1')
+        # 2. Configure the Gemini client directly
+        client = genai.Client(api_key=GOOGLE_API_KEY)
 
-        # The API can be instructed on tone and style directly in the prompt.
-        prompt = f"Speak the following text in a cheerful and friendly voice: '{text}'"
+        # 3. Construct the API call as per the new TTS documentation
+        prompt = f"Say cheerfully: {text}"
 
-        # The tts-1 model implicitly returns audio/wav format.
-        response = model.generate_content(prompt)
+        response = client.models.generate_content(
+            model="gemini-2.5-flash-preview-tts",
+            contents=prompt,
+            config=types.GenerateContentConfig(
+                response_modalities=["AUDIO"],
+                speech_config=types.SpeechConfig(
+                    voice_config=types.VoiceConfig(
+                        prebuilt_voice_config=types.PrebuiltVoiceConfig(
+                            voice_name=voice,
+                        )
+                    )
+                ),
+            )
+        )
 
-        # 4. Process the Response and Save the Audio File
-        # The audio data is conveniently located in the `audio_content` attribute.
-        if response.audio_content:
-            audio_file_path = create_unique_wav_file(response.audio_content)
+        # 4. Extract audio data from the new response structure
+        if response.candidates and response.candidates[0].content.parts:
+            audio_data = response.candidates[0].content.parts[0].inline_data.data
+            audio_file_path = create_unique_wav_file(audio_data)
             return audio_file_path
         else:
-            # Handle cases where audio might not be generated
             raise gr.Error("The API did not return audio data. Please check your text or try again.")
 
     except Exception as e:
@@ -74,46 +83,52 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
     gr.Markdown(
         """
         # ✨ Gemini Text-to-Speech Synthesizer
-        This app uses an API key stored securely in Hugging Face secrets.
-        Just enter the text you want to convert to speech!
+        This app uses a Google AI API key stored securely in Hugging Face secrets.
+        Just enter the text, choose a voice, and generate speech!
         """
     )
 
-    # Input for the text to be synthesized.
+    # List of available voices from the documentation
+    voice_options = [
+        "Zephyr", "Puck", "Charon", "Kore", "Fenrir", "Leda", "Orus", "Aoede",
+        "Callirrhoe", "Autonoe", "Enceladus", "Iapetus", "Umbriel", "Algieba",
+        "Despina", "Erinome", "Algenib", "Rasalgethi", "Laomedeia", "Achernar",
+        "Alnilam", "Schedar", "Gacrux", "Pulcherrima", "Achird", "Zubenelgenubi",
+        "Vindemiatrix", "Sadachbia", "Sadaltager", "Sulafat"
+    ]
+
+    # UI Components
     text_input = gr.Textbox(
         label="Text to Synthesize",
         placeholder="Hello! Welcome to the text-to-speech demonstration.",
         lines=4,
     )
 
-    # Button to trigger the synthesis process.
+    voice_dropdown = gr.Dropdown(
+        voice_options, label="Choose a Voice", value="Kore"
+    )
+
     submit_btn = gr.Button("Generate Speech", variant="primary")
 
-    # Component to display the generated audio.
     audio_output = gr.Audio(label="Generated Audio", type="filepath")
 
-    # Connect the button click event to the core function.
-    # The API key is now handled internally and not needed as an input.
+    # Connect the button click event to the core function
    submit_btn.click(
         fn=synthesize_speech,
-        inputs=[text_input],
+        inputs=[text_input, voice_dropdown],
         outputs=audio_output
     )
 
-    # Provide example text for users to try easily.
     gr.Examples(
         examples=[
-            "The weather is wonderful today, perfect for a walk in the park.",
-            "I am so excited to try out this new text-to-speech feature!",
-            "Congratulations on your amazing achievement!",
-            "This is a demonstration of high-quality speech synthesis."
+            ["The weather is wonderful today, perfect for a walk in the park.", "Puck"],
+            ["This is a demonstration of high-quality speech synthesis.", "Charon"],
+            ["By the pricking of my thumbs, something wicked this way comes.", "Enceladus"],
         ],
-        inputs=[text_input],
-        label="Example Prompts"
+        inputs=[text_input, voice_dropdown],
+        label="Example Prompts & Voices"
     )
 
 # --- Main execution block ---
-# To deploy, push this file and a requirements.txt to a Hugging Face Space
-# and set the GOOGLE_API_KEY in the repository secrets.
 if __name__ == "__main__":
     iface.launch()
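
Note on the new code path: `genai.Client(...)` and `types.GenerateContentConfig(...)` belong to the newer `google-genai` SDK, while this file still imports the legacy `google.generativeai` package, which does not expose a `Client` class. Below is a minimal sketch of the same TTS call with the imports that SDK expects; the `google-genai` imports, the placeholder API key, the sample text, and the output filename are assumptions for illustration and are not part of this commit, while the model name, voice, config structure, and 24 kHz 16-bit mono PCM framing are taken from the diff above.

# Minimal standalone sketch, assuming the newer google-genai SDK (pip install google-genai).
# The legacy google.generativeai package used in app.py has no Client class, so these
# imports are an assumption for illustration, not part of this commit.
import wave

from google import genai
from google.genai import types

client = genai.Client(api_key="YOUR_GOOGLE_API_KEY")  # placeholder key, not a real secret

response = client.models.generate_content(
    model="gemini-2.5-flash-preview-tts",
    contents="Say cheerfully: Have a wonderful day!",  # sample prompt, mirrors the app's style
    config=types.GenerateContentConfig(
        response_modalities=["AUDIO"],
        speech_config=types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Kore")
            )
        ),
    ),
)

# The response carries raw 16-bit PCM at 24 kHz, so it is framed into a WAV container,
# mirroring create_unique_wav_file() in app.py.
pcm = response.candidates[0].content.parts[0].inline_data.data
with wave.open("speech_output.wav", "wb") as wf:
    wf.setnchannels(1)
    wf.setsampwidth(2)
    wf.setframerate(24000)
    wf.writeframes(pcm)

With the legacy import kept as in this commit, the call to `genai.Client` would raise an AttributeError at runtime, so the Space's requirements would need the `google-genai` package for this path to work.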