File size: 2,952 Bytes
a09df36
94de3c6
a09df36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94de3c6
a09df36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6dde081
a09df36
 
22004d7
a09df36
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import os
import wave
from fastapi import FastAPI, Response, HTTPException
from pydantic import BaseModel
import google.generativeai as genai
from google.generativeai import types

# --- Configuration and API Key ---
# It is recommended to set your Google API key as a secret in your Hugging Face Space settings.
# The key for the secret should be 'GOOGLE_API_KEY'.
# Read the key up front; configuration is attempted only when one is present.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
try:
    if GOOGLE_API_KEY:
        genai.configure(api_key=GOOGLE_API_KEY)
    else:
        raise ValueError("Google API key not found. Please set it in your Hugging Face Space secrets.")
except Exception as config_error:
    # Best-effort: report the problem on stdout and let the module finish
    # importing, so a missing key shows up in the logs rather than as a crash.
    print(f"Error during API key configuration: {config_error}")


# --- Pydantic Model for Request Body ---
class TextToSpeechRequest(BaseModel):
    """Request body accepted by the /generate-audio/ endpoint.

    Every field declares a default, so a request may omit any of them.
    """

    # Text handed to the model as the prompt.
    text: str = "Say cheerfully: Have a wonderful day!"
    # Name of the prebuilt voice placed in the speech configuration.
    voice_name: str = "Kore"
    # File name advertised in the response's Content-Disposition header.
    output_filename: str = "output.wav"


# --- FastAPI App Initialization ---
# Application object; the route decorators in this module register on it.
app = FastAPI()


@app.get("/")
def read_root():
    """Return a short welcome payload that points clients at /generate-audio/."""
    welcome_text = "Welcome to the Text-to-Speech API using Gemini. Use the /generate-audio/ endpoint to create audio."
    payload = {"message": welcome_text}
    return payload


def _safe_attachment_filename(name, default="output.wav"):
    """Return *name* reduced to a value that is safe inside a quoted header parameter."""
    # Keep only the final path component, whichever separator the client used.
    base = str(name).replace("\\", "/").rsplit("/", 1)[-1]
    # Replace anything that could break the quoted string or the header
    # encoding (control characters, double quotes, non-ASCII) with "_".
    cleaned = "".join(
        ch if (ch.isascii() and ch.isprintable() and ch != '"') else "_"
        for ch in base
    ).strip()
    return cleaned or default


def _pcm_to_wav_bytes(pcm, channels=1, sample_width=2, frame_rate=24000):
    """Wrap raw PCM frames in a WAV container and return the file as bytes."""
    import io

    buffer = io.BytesIO()
    with wave.open(buffer, "wb") as wf:
        wf.setnchannels(channels)
        wf.setsampwidth(sample_width)
        wf.setframerate(frame_rate)
        wf.writeframes(pcm)
    # The header is finalized when the wave writer closes; the BytesIO itself
    # stays open, so its full contents can be read here.
    return buffer.getvalue()


def _extract_audio_bytes(response):
    """Return the first candidate's inline audio payload, or None if any level is missing."""
    candidates = getattr(response, "candidates", None)
    if not candidates:
        return None
    content = getattr(candidates[0], "content", None)
    parts = getattr(content, "parts", None)
    if not parts:
        return None
    inline_data = getattr(parts[0], "inline_data", None)
    return getattr(inline_data, "data", None) or None


@app.post("/generate-audio/")
async def generate_audio(request: TextToSpeechRequest):
    """
    Generate speech for the request text and return it as a WAV download.

    Args:
        request: Body carrying the prompt text, the prebuilt voice name and
            the file name to advertise for the download.

    Returns:
        A ``Response`` with media type ``audio/wav`` and a
        ``Content-Disposition: attachment`` header.

    Raises:
        HTTPException: status 500 when the model response carries no audio
            data, or when any other error occurs during generation (the
            error text becomes the detail).
    """
    try:
        # --- Text-to-Speech Generation ---
        # NOTE(review): confirm that `genai.generate_text` and
        # `types.GenerationOptions` exist in the installed SDK version; the
        # Gemini speech-generation docs appear to use a different client API.
        # NOTE(review): this call looks synchronous, so it would block the
        # event loop while the model responds — confirm.
        response = genai.generate_text(
            model="gemini-2.5-flash-preview-tts",
            prompt=request.text,
            options=types.GenerationOptions(
                response_modalities=["AUDIO"],
                speech_config=types.SpeechConfig(
                    voice_config=types.VoiceConfig(
                        prebuilt_voice_config=types.PrebuiltVoiceConfig(
                            voice_name=request.voice_name,
                        )
                    )
                ),
            )
        )

        # --- Extract Audio Data ---
        audio_data = _extract_audio_bytes(response)
        if not audio_data:
            raise HTTPException(status_code=500, detail="Audio data could not be generated.")

        # --- Package as WAV in memory ---
        # Mono, 2-byte samples at 24000 Hz; presumably the PCM format the
        # model emits — TODO confirm against the API documentation.
        wav_bytes = _pcm_to_wav_bytes(audio_data)

        # --- Return Audio File as Response ---
        # The file name is client-supplied, so it is sanitized and quoted
        # before being placed in a header.
        filename = _safe_attachment_filename(request.output_filename)
        return Response(
            content=wav_bytes,
            media_type="audio/wav",
            headers={"Content-Disposition": f'attachment; filename="{filename}"'},
        )

    except HTTPException:
        # Already carries the intended status and detail; re-wrapping it
        # would turn the detail into "500: <detail>".
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e

# To run this locally, use the command: uvicorn app:app --reload