Update app.py
Browse files
app.py
CHANGED
@@ -1,67 +1,76 @@
|
|
1 |
import gradio as gr
|
2 |
import google.generativeai as genai
|
|
|
3 |
import time
|
4 |
import os
|
|
|
5 |
|
6 |
# --- Load API Key from Hugging Face Secrets ---
|
7 |
-
#
|
8 |
# settings and add a secret named "GOOGLE_API_KEY" with your Google AI API key as the value.
|
9 |
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
|
10 |
|
11 |
-
# --- Helper
|
12 |
-
def create_unique_wav_file(
|
13 |
-
"""Saves audio data to a uniquely named WAV file and returns the path."""
|
14 |
-
# Create a directory to store audio outputs if it doesn't exist
|
15 |
output_dir = "audio_outputs"
|
16 |
os.makedirs(output_dir, exist_ok=True)
|
17 |
|
18 |
-
# Generate a unique filename using a timestamp
|
19 |
timestamp = int(time.time())
|
20 |
file_name = os.path.join(output_dir, f'speech_output_{timestamp}.wav')
|
21 |
|
22 |
-
# The API returns a complete WAV file, so we just write the bytes directly.
|
23 |
try:
|
24 |
-
with open(file_name,
|
25 |
-
|
|
|
|
|
|
|
26 |
return file_name
|
27 |
except Exception as e:
|
28 |
print(f"Error saving wave file: {e}")
|
29 |
raise gr.Error(f"Could not save audio file. Error: {e}")
|
30 |
|
31 |
-
|
32 |
-
|
33 |
-
def synthesize_speech(text):
|
34 |
"""
|
35 |
-
Synthesizes speech from text using the Gemini API.
|
36 |
-
This function uses the API key loaded from Hugging Face secrets.
|
37 |
"""
|
38 |
# 1. Validate Inputs (API Key and Text)
|
39 |
if not GOOGLE_API_KEY:
|
40 |
raise gr.Error("Google API Key not found. Please ensure you have set the GOOGLE_API_KEY secret in your Hugging Face Space settings.")
|
41 |
if not text or not text.strip():
|
42 |
raise gr.Error("Please enter some text to synthesize.")
|
|
|
|
|
43 |
|
44 |
try:
|
45 |
-
# 2. Configure the Gemini
|
46 |
-
genai.
|
47 |
-
|
48 |
-
# 3. Call the Text-to-Speech Model
|
49 |
-
# We use the 'tts-1' model which is optimized for this task.
|
50 |
-
model = genai.GenerativeModel(model_name='tts-1')
|
51 |
|
52 |
-
#
|
53 |
-
prompt = f"
|
54 |
|
55 |
-
|
56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
|
58 |
-
# 4.
|
59 |
-
|
60 |
-
|
61 |
-
audio_file_path = create_unique_wav_file(
|
62 |
return audio_file_path
|
63 |
else:
|
64 |
-
# Handle cases where audio might not be generated
|
65 |
raise gr.Error("The API did not return audio data. Please check your text or try again.")
|
66 |
|
67 |
except Exception as e:
|
@@ -74,46 +83,52 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
|
|
74 |
gr.Markdown(
|
75 |
"""
|
76 |
# ✨ Gemini Text-to-Speech Synthesizer
|
77 |
-
This app uses
|
78 |
-
Just enter the text
|
79 |
"""
|
80 |
)
|
81 |
|
82 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
83 |
text_input = gr.Textbox(
|
84 |
label="Text to Synthesize",
|
85 |
placeholder="Hello! Welcome to the text-to-speech demonstration.",
|
86 |
lines=4,
|
87 |
)
|
88 |
|
89 |
-
|
|
|
|
|
|
|
90 |
submit_btn = gr.Button("Generate Speech", variant="primary")
|
91 |
|
92 |
-
# Component to display the generated audio.
|
93 |
audio_output = gr.Audio(label="Generated Audio", type="filepath")
|
94 |
|
95 |
-
# Connect the button click event to the core function
|
96 |
-
# The API key is now handled internally and not needed as an input.
|
97 |
submit_btn.click(
|
98 |
fn=synthesize_speech,
|
99 |
-
inputs=[text_input],
|
100 |
outputs=audio_output
|
101 |
)
|
102 |
|
103 |
-
# Provide example text for users to try easily.
|
104 |
gr.Examples(
|
105 |
examples=[
|
106 |
-
"The weather is wonderful today, perfect for a walk in the park.",
|
107 |
-
"
|
108 |
-
"
|
109 |
-
"This is a demonstration of high-quality speech synthesis."
|
110 |
],
|
111 |
-
inputs=[text_input],
|
112 |
-
label="Example Prompts"
|
113 |
)
|
114 |
|
115 |
# --- Main execution block ---
|
116 |
-
# To deploy, push this file and a requirements.txt to a Hugging Face Space
|
117 |
-
# and set the GOOGLE_API_KEY in the repository secrets.
|
118 |
if __name__ == "__main__":
|
119 |
iface.launch()
|
|
|
1 |
import os
import time
import wave

import gradio as gr
# NOTE(review): the calls used below (genai.Client, types.GenerateContentConfig,
# types.SpeechConfig, types.VoiceConfig, types.PrebuiltVoiceConfig) belong to the
# new Google Gen AI SDK ("google-genai"), NOT the legacy "google-generativeai"
# package, which has no `Client` class. Importing from the correct package is
# required for the TTS code to resolve at runtime.
from google import genai
from google.genai import types

# --- Load API Key from Hugging Face Secrets ---
# For this to work on Hugging Face Spaces, you must go to your Space's
# settings and add a secret named "GOOGLE_API_KEY" with your Google AI API key as the value.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
|
12 |
|
13 |
+
# --- Helper Functions ---
def create_unique_wav_file(pcm_data, channels=1, rate=24000, sample_width=2):
    """Save raw PCM audio frames to a uniquely named WAV file.

    Args:
        pcm_data: Raw PCM frame bytes to write (as returned by the TTS API).
        channels: Number of audio channels. Defaults to 1 (mono).
        rate: Sample rate in Hz. Defaults to 24000.
        sample_width: Bytes per sample. Defaults to 2 (16-bit).

    Returns:
        The filesystem path of the written ``.wav`` file.

    Raises:
        gr.Error: If the file cannot be written to disk.
    """
    output_dir = "audio_outputs"
    os.makedirs(output_dir, exist_ok=True)

    # A whole-second timestamp collides when called more than once per second
    # (concurrent users would overwrite each other's audio), so use a
    # nanosecond-resolution timestamp to make the name effectively unique.
    timestamp_ns = time.time_ns()
    file_name = os.path.join(output_dir, f'speech_output_{timestamp_ns}.wav')

    try:
        with wave.open(file_name, "wb") as wf:
            wf.setnchannels(channels)
            wf.setsampwidth(sample_width)
            wf.setframerate(rate)
            wf.writeframes(pcm_data)
        return file_name
    except Exception as e:
        print(f"Error saving wave file: {e}")
        raise gr.Error(f"Could not save audio file. Error: {e}")
|
32 |
|
33 |
+
# --- Core API Logic (Rewritten based on new documentation) ---
|
34 |
+
def synthesize_speech(text, voice):
|
|
|
35 |
"""
|
36 |
+
Synthesizes speech from text using the Gemini API's native TTS capabilities.
|
|
|
37 |
"""
|
38 |
# 1. Validate Inputs (API Key and Text)
|
39 |
if not GOOGLE_API_KEY:
|
40 |
raise gr.Error("Google API Key not found. Please ensure you have set the GOOGLE_API_KEY secret in your Hugging Face Space settings.")
|
41 |
if not text or not text.strip():
|
42 |
raise gr.Error("Please enter some text to synthesize.")
|
43 |
+
if not voice:
|
44 |
+
raise gr.Error("Please select a voice.")
|
45 |
|
46 |
try:
|
47 |
+
# 2. Configure the Gemini client directly
|
48 |
+
client = genai.Client(api_key=GOOGLE_API_KEY)
|
|
|
|
|
|
|
|
|
49 |
|
50 |
+
# 3. Construct the API call as per the new TTS documentation
|
51 |
+
prompt = f"Say cheerfully: {text}"
|
52 |
|
53 |
+
response = client.models.generate_content(
|
54 |
+
model="gemini-2.5-flash-preview-tts",
|
55 |
+
contents=prompt,
|
56 |
+
config=types.GenerateContentConfig(
|
57 |
+
response_modalities=["AUDIO"],
|
58 |
+
speech_config=types.SpeechConfig(
|
59 |
+
voice_config=types.VoiceConfig(
|
60 |
+
prebuilt_voice_config=types.PrebuiltVoiceConfig(
|
61 |
+
voice_name=voice,
|
62 |
+
)
|
63 |
+
)
|
64 |
+
),
|
65 |
+
)
|
66 |
+
)
|
67 |
|
68 |
+
# 4. Extract audio data from the new response structure
|
69 |
+
if response.candidates and response.candidates[0].content.parts:
|
70 |
+
audio_data = response.candidates[0].content.parts[0].inline_data.data
|
71 |
+
audio_file_path = create_unique_wav_file(audio_data)
|
72 |
return audio_file_path
|
73 |
else:
|
|
|
74 |
raise gr.Error("The API did not return audio data. Please check your text or try again.")
|
75 |
|
76 |
except Exception as e:
|
|
|
83 |
gr.Markdown(
|
84 |
"""
|
85 |
# ✨ Gemini Text-to-Speech Synthesizer
|
86 |
+
This app uses a Google AI API key stored securely in Hugging Face secrets.
|
87 |
+
Just enter the text, choose a voice, and generate speech!
|
88 |
"""
|
89 |
)
|
90 |
|
91 |
+
# List of available voices from the documentation
|
92 |
+
voice_options = [
|
93 |
+
"Zephyr", "Puck", "Charon", "Kore", "Fenrir", "Leda", "Orus", "Aoede",
|
94 |
+
"Callirrhoe", "Autonoe", "Enceladus", "Iapetus", "Umbriel", "Algieba",
|
95 |
+
"Despina", "Erinome", "Algenib", "Rasalgethi", "Laomedeia", "Achernar",
|
96 |
+
"Alnilam", "Schedar", "Gacrux", "Pulcherrima", "Achird", "Zubenelgenubi",
|
97 |
+
"Vindemiatrix", "Sadachbia", "Sadaltager", "Sulafat"
|
98 |
+
]
|
99 |
+
|
100 |
+
# UI Components
|
101 |
text_input = gr.Textbox(
|
102 |
label="Text to Synthesize",
|
103 |
placeholder="Hello! Welcome to the text-to-speech demonstration.",
|
104 |
lines=4,
|
105 |
)
|
106 |
|
107 |
+
voice_dropdown = gr.Dropdown(
|
108 |
+
voice_options, label="Choose a Voice", value="Kore"
|
109 |
+
)
|
110 |
+
|
111 |
submit_btn = gr.Button("Generate Speech", variant="primary")
|
112 |
|
|
|
113 |
audio_output = gr.Audio(label="Generated Audio", type="filepath")
|
114 |
|
115 |
+
# Connect the button click event to the core function
|
|
|
116 |
submit_btn.click(
|
117 |
fn=synthesize_speech,
|
118 |
+
inputs=[text_input, voice_dropdown],
|
119 |
outputs=audio_output
|
120 |
)
|
121 |
|
|
|
122 |
gr.Examples(
|
123 |
examples=[
|
124 |
+
["The weather is wonderful today, perfect for a walk in the park.", "Puck"],
|
125 |
+
["This is a demonstration of high-quality speech synthesis.", "Charon"],
|
126 |
+
["By the pricking of my thumbs, something wicked this way comes.", "Enceladus"],
|
|
|
127 |
],
|
128 |
+
inputs=[text_input, voice_dropdown],
|
129 |
+
label="Example Prompts & Voices"
|
130 |
)
|
131 |
|
132 |
# --- Main execution block ---
def _run() -> None:
    """Launch the Gradio interface when this file is executed as a script."""
    iface.launch()


if __name__ == "__main__":
    _run()
|