import gradio as gr
# Gemini's dedicated text-to-speech models are exposed through the google-genai
# SDK (the successor to the deprecated google.generativeai package).
from google import genai
from google.genai import types
import time
import os
import wave

# --- Load API Key from Hugging Face Secrets ---
# IMPORTANT: For this to work on Hugging Face Spaces, you must go to your Space's
# settings and add a secret named "GOOGLE_API_KEY" with your Google AI API key as the value.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
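
# For local testing outside of Spaces (a sketch, assuming a Unix-like shell and
# that this file is saved as app.py, the usual Spaces convention), export the
# same variable before launching:
#   export GOOGLE_API_KEY="your-key-here"
#   python app.py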

# --- Helper Function ---
def create_unique_wav_file(audio_data):
    """Wraps raw PCM audio data in a WAV container and returns the file path."""
    # Create a directory to store audio outputs if it doesn't exist
    output_dir = "audio_outputs"
    os.makedirs(output_dir, exist_ok=True)

    # Generate a unique filename using a timestamp
    timestamp = int(time.time())
    file_name = os.path.join(output_dir, f'speech_output_{timestamp}.wav')

    # The Gemini TTS models return raw 16-bit PCM at a 24 kHz sample rate (mono),
    # not a complete WAV file, so we add the WAV header with the wave module.
    try:
        with wave.open(file_name, 'wb') as wf:
            wf.setnchannels(1)       # mono
            wf.setsampwidth(2)       # 16-bit samples
            wf.setframerate(24000)   # 24 kHz
            wf.writeframes(audio_data)
        return file_name
    except Exception as e:
        print(f"Error saving wave file: {e}")
        raise gr.Error(f"Could not save audio file. Error: {e}")


# --- Core API Logic ---
def synthesize_speech(text):
    """
    Synthesizes speech from text using the Gemini API.
    This function uses the API key loaded from Hugging Face secrets.
    """
    # 1. Validate Inputs (API Key and Text)
    if not GOOGLE_API_KEY:
        raise gr.Error("Google API Key not found. Please ensure you have set the GOOGLE_API_KEY secret in your Hugging Face Space settings.")
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    try:
        # 2. Create a Gemini client with the loaded key
        client = genai.Client(api_key=GOOGLE_API_KEY)

        # 3. Call the Text-to-Speech Model
        # Gemini exposes dedicated TTS models; the Flash preview variant is used
        # here (a preview model name, so it may need updating over time).
        # Tone and style can be requested directly in the prompt.
        prompt = f"Say the following in a cheerful and friendly voice: {text}"

        response = client.models.generate_content(
            model="gemini-2.5-flash-preview-tts",
            contents=prompt,
            config=types.GenerateContentConfig(
                response_modalities=["AUDIO"],
                speech_config=types.SpeechConfig(
                    voice_config=types.VoiceConfig(
                        prebuilt_voice_config=types.PrebuiltVoiceConfig(
                            voice_name="Kore"
                        )
                    )
                ),
            ),
        )

        # 4. Process the Response and Save the Audio File
        # The raw PCM bytes come back as inline data on the first content part.
        candidates = response.candidates or []
        parts = candidates[0].content.parts if candidates else []
        audio_data = parts[0].inline_data.data if parts and parts[0].inline_data else None

        if audio_data:
            audio_file_path = create_unique_wav_file(audio_data)
            return audio_file_path
        else:
            # Handle cases where audio might not be generated
            raise gr.Error("The API did not return audio data. Please check your text or try again.")

    except gr.Error:
        # Re-raise errors we deliberately surfaced above without re-wrapping them.
        raise
    except Exception as e:
        # Provide a more informative error message in the UI.
        print(f"An error occurred: {e}")
        raise gr.Error(f"Failed to synthesize speech. Please check your network connection and that your API key is valid. Error: {e}")
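
# Quick local sanity check (optional, a sketch): with GOOGLE_API_KEY set, calling
# synthesize_speech("Hello there!") directly should return a path such as
# "audio_outputs/speech_output_<timestamp>.wav".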

# --- Gradio User Interface ---
with gr.Blocks(theme=gr.themes.Soft()) as iface:
    gr.Markdown(
        """
        # ✨ Gemini Text-to-Speech Synthesizer
        This app uses an API key stored securely in Hugging Face secrets. 
        Just enter the text you want to convert to speech!
        """
    )
    
    # Input for the text to be synthesized.
    text_input = gr.Textbox(
        label="Text to Synthesize",
        placeholder="Hello! Welcome to the text-to-speech demonstration.",
        lines=4,
    )
    
    # Button to trigger the synthesis process.
    submit_btn = gr.Button("Generate Speech", variant="primary")
    
    # Component to display the generated audio.
    audio_output = gr.Audio(label="Generated Audio", type="filepath")

    # Connect the button click event to the core function.
    # The API key is now handled internally and not needed as an input.
    submit_btn.click(
        fn=synthesize_speech,
        inputs=[text_input],
        outputs=audio_output
    )
    
    # Provide example text for users to try easily.
    gr.Examples(
        examples=[
            "The weather is wonderful today, perfect for a walk in the park.",
            "I am so excited to try out this new text-to-speech feature!",
            "Congratulations on your amazing achievement!",
            "This is a demonstration of high-quality speech synthesis."
        ],
        inputs=[text_input],
        label="Example Prompts"
    )

# --- Main execution block ---
# To deploy, push this file and a requirements.txt to a Hugging Face Space
# and set the GOOGLE_API_KEY in the repository secrets.
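# A minimal requirements.txt for this Space might look like the following
# (a sketch; pin versions as your deployment requires):
#
#   gradio
#   google-genai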
if __name__ == "__main__":
    iface.launch()