Athspi committed on
Commit
5011794
·
verified ·
1 Parent(s): 839126d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +68 -23
app.py CHANGED
@@ -1,38 +1,83 @@
1
- from fastapi import FastAPI, Form
2
- from fastapi.responses import FileResponse
3
- import google.generativeai as genai
4
- import wave
5
  import os
 
 
 
 
6
 
7
- app = FastAPI()
8
 
9
- GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
 
 
10
 
11
- if not GOOGLE_API_KEY:
12
- raise ValueError("Set GOOGLE_API_KEY environment variable.")
13
 
14
- genai.configure(api_key=GOOGLE_API_KEY)
15
 
16
- model = genai.GenerativeModel(model_name="gemini-2.5-flash-preview-tts")
 
 
17
 
18
- def wave_file(filename, pcm, channels=1, rate=24000, sample_width=2):
19
  with wave.open(filename, "wb") as wf:
20
  wf.setnchannels(channels)
21
  wf.setsampwidth(sample_width)
22
  wf.setframerate(rate)
23
- wf.writeframes(pcm)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
- @app.post("/tts")
26
- def tts(text: str = Form(...)):
27
- response = model.generate_content(
28
- text,
29
- generation_config={"response_mime_type": "audio/wav"},
30
- response_modality="AUDIO"
31
- )
32
 
33
- audio_data = response.candidates[0].content.parts[0].inline_data.data
34
 
35
- output_path = "out.wav"
36
- wave_file(output_path, audio_data)
37
 
38
- return FileResponse(output_path, media_type="audio/wav", filename="output.wav")
 
 
 
1
+ from fastapi import FastAPI, HTTPException
2
+ from pydantic import BaseModel
3
+ from typing import Dict
 
4
  import os
5
+ import requests
6
+ import wave
7
+ import base64
8
+ from dotenv import load_dotenv
9
 
10
# load_dotenv() runs before the environment lookup below so that a key
# supplied through a dotenv file (python-dotenv) is visible to os.environ.
load_dotenv()

# Fail at import time when no key is configured: every request needs it.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    raise ValueError("Missing GEMINI_API_KEY")

# generateContent endpoint of the TTS preview model used by this service.
API_URL = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-tts:generateContent"

app = FastAPI()
19
 
20
class TTSRequest(BaseModel):
    """JSON body accepted by the multi-speaker TTS endpoint."""

    # Text sent to the model as the single content part of the request.
    prompt: str
    # Maps each speaker name to a voice name,
    # e.g. {"Joe": "Kore", "Jane": "Puck"}.
    speakers: Dict[str, str]
23
 
24
def wave_file(filename, pcm_data, channels=1, rate=24000, sample_width=2):
    """Write raw PCM bytes to *filename* as an uncompressed WAV file.

    Args:
        filename: Destination path; opened for writing in binary mode.
        pcm_data: Raw audio frames, written as-is after the header.
        channels: Number of audio channels (default 1).
        rate: Frame rate in Hz (default 24000).
        sample_width: Bytes per sample (default 2).
    """
    # One setparams() call is equivalent to the three individual setters; the
    # frame count is left at 0 and fixed up by the wave module on close.
    header = (channels, sample_width, rate, 0, "NONE", "not compressed")
    with wave.open(filename, "wb") as out:
        out.setparams(header)
        out.writeframes(pcm_data)
30
+
31
def _build_tts_payload(prompt, speakers):
    """Build the ``generateContent`` request body for multi-speaker TTS.

    Args:
        prompt: Text to synthesize.
        speakers: Mapping of speaker name to prebuilt voice name.

    Returns:
        A JSON-serializable dict in the shape the Gemini REST API documents
        for speech generation.
    """
    speaker_voice_configs = [
        {
            "speaker": speaker,
            "voiceConfig": {"prebuiltVoiceConfig": {"voiceName": voice_name}},
        }
        for speaker, voice_name in speakers.items()
    ]
    return {
        "contents": [{"parts": [{"text": prompt}]}],
        # The audio modality and the speech configuration both belong inside
        # generationConfig; as top-level fields they are not part of the
        # request schema. No responseMimeType is sent: the audio comes back as
        # base64 inline data and is wrapped in a WAV container locally.
        "generationConfig": {
            "responseModalities": ["AUDIO"],
            "speechConfig": {
                "multiSpeakerVoiceConfig": {
                    "speakerVoiceConfigs": speaker_voice_configs,
                },
            },
        },
    }


@app.post("/dynamic_multispeaker_tts")
def dynamic_multispeaker_tts(request: TTSRequest):
    """Synthesize ``request.prompt`` with per-speaker voices and save a WAV.

    Sends the prompt and the speaker-to-voice mapping to the Gemini TTS
    model, decodes the base64 audio from the first candidate, and writes it
    to ``output.wav`` in the working directory.

    Args:
        request: Prompt text plus a mapping of speaker name to voice name.

    Returns:
        A dict with a ``message`` key confirming the file was written.

    Raises:
        HTTPException: Status 500 when the upstream call fails or returns a
            non-200 status, when the response lacks the expected audio
            payload, or when the WAV file cannot be written.
    """
    headers = {
        "Content-Type": "application/json",
        # API keys go in x-goog-api-key; "Authorization: Bearer" is reserved
        # for OAuth access tokens and is rejected when given an API key.
        "x-goog-api-key": GEMINI_API_KEY,
    }
    payload = _build_tts_payload(request.prompt, request.speakers)

    try:
        # Bounded wait so a stalled upstream call cannot hang the worker.
        response = requests.post(
            API_URL, headers=headers, json=payload, timeout=120
        )
    except requests.RequestException as e:
        raise HTTPException(
            status_code=500, detail=f"Gemini request failed: {e}"
        ) from e

    if response.status_code != 200:
        raise HTTPException(status_code=500, detail=response.text)

    try:
        result = response.json()
        audio_data_b64 = result["candidates"][0]["content"]["parts"][0]["inlineData"]["data"]
        audio_data = base64.b64decode(audio_data_b64)
    except (KeyError, IndexError, TypeError, ValueError) as e:
        # ValueError covers both a non-JSON body and invalid base64.
        raise HTTPException(
            status_code=500, detail=f"Unexpected Gemini response: {e!r}"
        ) from e

    # NOTE(review): a fixed path means concurrent requests overwrite each
    # other's output; kept because the response message names this file.
    output_path = "output.wav"
    try:
        wave_file(output_path, audio_data)
    except (OSError, wave.Error) as e:
        raise HTTPException(status_code=500, detail=str(e)) from e

    return {"message": "Audio generated successfully. Download 'output.wav'."}
80
 
81
@app.get("/")
def root():
    """Return a static status message for the service root path."""
    status = {"message": "Gemini Multi-Speaker TTS is running!"}
    return status