Athspi committed on
Commit
4a54590
·
verified ·
1 Parent(s): 78bb362

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -52
app.py CHANGED
@@ -1,81 +1,66 @@
1
- import os
2
- import wave
3
- from fastapi import FastAPI, Response, HTTPException
4
  from pydantic import BaseModel
5
- import google.generativeai as genai
6
  from google.generativeai import types
 
 
7
 
8
- # --- Configuration and API Key ---
9
- # It is recommended to set your Google API key as a secret in your Hugging Face Space settings.
10
- # The key for the secret should be 'GOOGLE_API_KEY'.
11
- try:
12
- GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
13
- if not GOOGLE_API_KEY:
14
- raise ValueError("Google API key not found. Please set it in your Hugging Face Space secrets.")
15
- genai.configure(api_key=GOOGLE_API_KEY)
16
- except Exception as e:
17
- # This will help in debugging if the key is not set.
18
- print(f"Error during API key configuration: {e}")
19
 
 
 
20
 
21
- # --- Pydantic Model for Request Body ---
22
- class TextToSpeechRequest(BaseModel):
23
- text: str = "Say cheerfully: Have a wonderful day!"
24
- voice_name: str = "Kore"
25
- output_filename: str = "output.wav"
26
 
 
 
27
 
28
- # --- FastAPI App Initialization ---
29
- app = FastAPI()
 
 
 
 
 
 
30
 
31
 
 
32
  @app.get("/")
33
  def read_root():
34
- return {"message": "Welcome to the Text-to-Speech API using Gemini. Use the /generate-audio/ endpoint to create audio."}
35
 
36
 
37
- @app.post("/generate-audio/")
38
- async def generate_audio(request: TextToSpeechRequest):
39
- """
40
- This endpoint generates audio from the provided text using Google's Gemini model.
41
- """
42
  try:
43
- # --- Text-to-Speech Generation ---
44
- response = genai.generate_text(
45
  model="gemini-2.5-flash-preview-tts",
46
- prompt=request.text,
47
- options=types.GenerationOptions(
48
  response_modalities=["AUDIO"],
49
  speech_config=types.SpeechConfig(
50
  voice_config=types.VoiceConfig(
51
  prebuilt_voice_config=types.PrebuiltVoiceConfig(
52
- voice_name=request.voice_name,
53
  )
54
  )
55
  ),
56
  )
57
  )
58
 
59
- # --- Extract Audio Data ---
60
- if not response.candidates or not response.candidates[0].content.parts or not response.candidates[0].content.parts[0].inline_data.data:
61
- raise HTTPException(status_code=500, detail="Audio data could not be generated.")
62
 
63
- audio_data = response.candidates[0].content.parts[0].inline_data.data
 
 
64
 
65
- # --- Save to a WAV file in memory ---
66
- import io
67
- buffer = io.BytesIO()
68
- with wave.open(buffer, "wb") as wf:
69
- wf.setnchannels(1)
70
- wf.setsampwidth(2)
71
- wf.setframerate(24000)
72
- wf.writeframes(audio_data)
73
- buffer.seek(0)
74
-
75
- # --- Return Audio File as Response ---
76
- return Response(content=buffer.getvalue(), media_type="audio/wav", headers={"Content-Disposition": f"attachment; filename={request.output_filename}"})
77
 
78
  except Exception as e:
79
- raise HTTPException(status_code=500, detail=str(e))
80
-
81
- # To run this locally, use the command: uvicorn app:app --reload
 
1
+ from fastapi import FastAPI, Form
2
+ from fastapi.responses import FileResponse, JSONResponse
 
3
  from pydantic import BaseModel
4
+ from google import generativeai as genai
5
  from google.generativeai import types
6
+ import wave
7
+ import os
8
 
9
+ app = FastAPI()
 
 
 
 
 
 
 
 
 
 
10
 
11
+ # Set your Google API key via environment variable for Spaces safety
12
+ GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
13
 
14
+ if not GOOGLE_API_KEY:
15
+ raise ValueError("GOOGLE_API_KEY environment variable not set.")
 
 
 
16
 
17
+ # Initialize the GenAI client
18
+ client = genai.Client(api_key=GOOGLE_API_KEY)
19
 
20
+
21
+ # Function to write PCM audio data to WAV file
22
+ def wave_file(filename, pcm, channels=1, rate=24000, sample_width=2):
23
+ with wave.open(filename, "wb") as wf:
24
+ wf.setnchannels(channels)
25
+ wf.setsampwidth(sample_width)
26
+ wf.setframerate(rate)
27
+ wf.writeframes(pcm)
28
 
29
 
30
+ # Root endpoint to check API is running
31
  @app.get("/")
32
  def read_root():
33
+ return {"message": "🦜 Gemini TTS FastAPI running on Hugging Face Spaces!"}
34
 
35
 
36
+ # POST endpoint to generate TTS audio from text
37
+ @app.post("/generate-audio")
38
+ def generate_audio(text: str = Form(...)):
 
 
39
  try:
40
+ response = client.models.generate_content(
 
41
  model="gemini-2.5-flash-preview-tts",
42
+ contents=text,
43
+ config=types.GenerateContentConfig(
44
  response_modalities=["AUDIO"],
45
  speech_config=types.SpeechConfig(
46
  voice_config=types.VoiceConfig(
47
  prebuilt_voice_config=types.PrebuiltVoiceConfig(
48
+ voice_name="Kore"
49
  )
50
  )
51
  ),
52
  )
53
  )
54
 
55
+ # Get audio data
56
+ data = response.candidates[0].content.parts[0].inline_data.data
 
57
 
58
+ # Save to file
59
+ file_name = "output.wav"
60
+ wave_file(file_name, data)
61
 
62
+ # Return the file as response
63
+ return FileResponse(file_name, media_type="audio/wav", filename="output.wav")
 
 
 
 
 
 
 
 
 
 
64
 
65
  except Exception as e:
66
+ return JSONResponse(content={"error": str(e)}, status_code=500)