Athspi committed on
Commit
7b02fdc
·
verified ·
1 Parent(s): 6aa8d7a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +67 -100
app.py CHANGED
@@ -1,104 +1,71 @@
1
  import os
2
- import requests
3
  import wave
4
- import base64
5
- from fastapi import FastAPI, HTTPException
6
- from fastapi.responses import FileResponse
7
- from pydantic import BaseModel
8
- from dotenv import load_dotenv
9
-
10
- # Load API key
11
# Load variables from a .env file (if any) into the environment, then read
# the Gemini key and refuse to start without it.
load_dotenv()
API_KEY = os.environ.get("GEMINI_API_KEY")
if not API_KEY:
    raise RuntimeError("Missing GEMINI_API_KEY in environment")

# generateContent endpoint of the TTS preview model; the key travels as a
# query parameter.
BASE_URL = (
    "https://generativelanguage.googleapis.com/v1beta/models/"
    f"gemini-2.5-flash-preview-tts:generateContent?key={API_KEY}"
)

app = FastAPI(title="Gemini TTS JSON API")
24
-
25
def save_wav(path: "str | os.PathLike", pcm: bytes, channels=1, rate=24000, width=2):
    """Write raw PCM audio into a WAV container.

    Args:
        path: Destination. A ``str`` path, an ``os.PathLike`` such as
            ``pathlib.Path``, or a writable binary file object.
        pcm: Raw PCM sample bytes, written unchanged.
        channels: Number of audio channels.
        rate: Sample rate in Hz.
        width: Sample width in bytes.
    """
    # wave.open() only recognises ``str`` as a filename and treats anything
    # else as a file object, so path-like objects are converted up front.
    if isinstance(path, os.PathLike):
        path = os.fspath(path)

    with wave.open(path, "wb") as wf:
        wf.setnchannels(channels)
        wf.setsampwidth(width)
        wf.setframerate(rate)
        wf.writeframes(pcm)
31
-
32
class SingleTTSRequest(BaseModel):
    # JSON body accepted by POST /single_tts.

    # Text sent to the model as the single content part.
    prompt: str
    # Forwarded to the API as prebuiltVoiceConfig.voiceName.
    voice_name: str
35
-
36
class MultiTTSRequest(BaseModel):
    # JSON body accepted by POST /multi_tts: one prompt plus two
    # (speaker label, voice name) pairs.

    # Text sent to the model as the single content part.
    prompt: str

    # First speaker label and the prebuilt voice assigned to it.
    speaker1: str
    voice1: str

    # Second speaker label and the prebuilt voice assigned to it.
    speaker2: str
    voice2: str
42
-
43
@app.get("/")
def health():
    # Liveness endpoint: a constant payload confirming the app is serving.
    status_message = "Gemini TTS JSON API up and running!"
    return {"status": status_message}
46
-
47
def _synthesize_wav(prompt: str, speech_config: dict, download_name: str):
    """Send *prompt* to Gemini TTS and return the audio as a WAV HTTP response.

    Args:
        prompt: Text placed in the single content part of the request.
        speech_config: Value for ``generationConfig.speechConfig``.
        download_name: Filename advertised in ``Content-Disposition``.

    Raises:
        HTTPException: With the upstream status code when the API answers
            with a non-200 status, or with 502 when the API cannot be
            reached or its reply carries no audio.
    """
    import io

    from fastapi import Response

    payload = {
        "model": "gemini-2.5-flash-preview-tts",
        "contents": [{"parts": [{"text": prompt}]}],
        # The REST field is "generationConfig"; a top-level "config" key is
        # rejected by the API as an unknown field.
        "generationConfig": {
            "responseModalities": ["AUDIO"],
            "speechConfig": speech_config,
        },
    }

    try:
        resp = requests.post(BASE_URL, json=payload, timeout=120)
    except requests.RequestException as exc:
        # The exception text is deliberately not echoed to the client: it can
        # contain the request URL, which embeds the API key.
        raise HTTPException(
            status_code=502, detail="Could not reach the Gemini API"
        ) from exc

    if resp.status_code != 200:
        try:
            detail = resp.json()
        except ValueError:
            # Error bodies are not guaranteed to be JSON.
            detail = resp.text
        raise HTTPException(status_code=resp.status_code, detail=detail)

    try:
        b64 = resp.json()["candidates"][0]["content"]["parts"][0]["inlineData"]["data"]
    except (KeyError, IndexError, TypeError, ValueError) as exc:
        raise HTTPException(
            status_code=502, detail="Gemini response contained no audio data"
        ) from exc

    # Build the WAV in memory: a fixed on-disk filename would be shared by
    # concurrent requests, which could then overwrite each other's audio.
    buf = io.BytesIO()
    save_wav(buf, base64.b64decode(b64))
    return Response(
        content=buf.getvalue(),
        media_type="audio/wav",
        headers={"Content-Disposition": f'attachment; filename="{download_name}"'},
    )


@app.post("/single_tts")
def single_tts(req: SingleTTSRequest):
    """Synthesize ``req.prompt`` with one prebuilt voice and return a WAV file."""
    speech_config = {
        "voiceConfig": {"prebuiltVoiceConfig": {"voiceName": req.voice_name}}
    }
    return _synthesize_wav(req.prompt, speech_config, "single_output.wav")


@app.post("/multi_tts")
def multi_tts(req: MultiTTSRequest):
    """Synthesize a two-speaker ``req.prompt`` and return a WAV file."""
    speakers = ((req.speaker1, req.voice1), (req.speaker2, req.voice2))
    speech_config = {
        "multiSpeakerVoiceConfig": {
            "speakerVoiceConfigs": [
                {
                    "speaker": speaker,
                    "voiceConfig": {"prebuiltVoiceConfig": {"voiceName": voice}},
                }
                for speaker, voice in speakers
            ]
        }
    }
    return _synthesize_wav(req.prompt, speech_config, "multi_output.wav")
 
1
  import os
 
2
  import wave
3
+ import gradio as gr
4
+ import google.generativeai as genai
5
+
6
+ # Set your API Key (or via Hugging Face Secrets / os.environ)
7
# The key is read from the environment (for example a Hugging Face secret);
# the app refuses to start without it.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    raise ValueError("Please set your GOOGLE_API_KEY environment variable.")

# Register the key with the SDK, then create the handle for the TTS model.
genai.configure(api_key=GOOGLE_API_KEY)
model = genai.GenerativeModel(model_name="gemini-2.5-flash-preview-tts")
17
+
18
+ # Function to save raw PCM data to WAV file
19
def save_wave(filename, pcm_data, channels=1, rate=24000, sample_width=2):
    """Write raw PCM audio into a WAV container.

    Args:
        filename: Destination. A ``str`` path, an ``os.PathLike`` such as
            ``pathlib.Path``, or a writable binary file object.
        pcm_data: Raw PCM sample bytes, written unchanged.
        channels: Number of audio channels.
        rate: Sample rate in Hz.
        sample_width: Sample width in bytes.
    """
    # wave.open() only recognises ``str`` as a filename and treats anything
    # else as a file object, so path-like objects are converted up front.
    if isinstance(filename, os.PathLike):
        filename = os.fspath(filename)

    with wave.open(filename, "wb") as wf:
        wf.setnchannels(channels)
        wf.setsampwidth(sample_width)
        wf.setframerate(rate)
        wf.writeframes(pcm_data)
25
+
26
+ # Function to handle TTS generation
27
# REST endpoint (generateContent method) of the Gemini TTS preview model.
_TTS_ENDPOINT = (
    "https://generativelanguage.googleapis.com/v1beta/models/"
    "gemini-2.5-flash-preview-tts:generateContent"
)


def _fetch_tts_pcm(text, voice_name):
    """POST *text* to the Gemini TTS endpoint and return the decoded PCM bytes.

    Raises:
        RuntimeError: When the API answers with an HTTP error status; the
            message carries the status code and the response body.
    """
    import base64
    import json
    import urllib.error
    import urllib.request

    payload = {
        "contents": [{"parts": [{"text": text}]}],
        "generationConfig": {
            "responseModalities": ["AUDIO"],
            "speechConfig": {
                "voiceConfig": {"prebuiltVoiceConfig": {"voiceName": voice_name}}
            },
        },
    }
    request = urllib.request.Request(
        _TTS_ENDPOINT,
        data=json.dumps(payload).encode("utf-8"),
        # The key goes in a header rather than the URL so it does not end up
        # in error messages that quote the request URL.
        headers={
            "Content-Type": "application/json",
            "x-goog-api-key": GOOGLE_API_KEY,
        },
        method="POST",
    )
    try:
        with urllib.request.urlopen(request, timeout=120) as reply:
            body = json.load(reply)
    except urllib.error.HTTPError as err:
        detail = err.read().decode("utf-8", "replace")
        raise RuntimeError(f"Gemini API returned HTTP {err.code}: {detail}") from err

    # The audio arrives base64-encoded in the first part of the first candidate.
    encoded = body["candidates"][0]["content"]["parts"][0]["inlineData"]["data"]
    return base64.b64decode(encoded)


def generate_tts(text, voice_name="Kore"):
    """Convert *text* to speech and save it as a WAV file.

    The request is made against the REST API directly:
    ``GenerativeModel.generate_content`` accepts no ``response_modality``
    argument, so the SDK call raised ``TypeError`` on every invocation.

    Args:
        text: Text to speak. ``None``, empty, or whitespace-only input is
            rejected without calling the API.
        voice_name: Prebuilt Gemini voice to use.

    Returns:
        A ``(path, status)`` tuple: the path of the generated WAV file (or
        ``None`` on failure) and a human-readable status message.
    """
    if not text or not text.strip():
        return None, "Please enter some text."

    import tempfile

    # This is the UI boundary: any failure is reported in the status box
    # instead of surfacing as an unhandled exception.
    try:
        pcm_data = _fetch_tts_pcm(text, voice_name)

        # A unique file per call; a fixed name would let concurrent users
        # overwrite each other's audio. The file is left for the UI to serve.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
            output_filename = handle.name
        save_wave(output_filename, pcm_data)
    except Exception as e:
        return None, f"Error: {str(e)}"

    return output_filename, "Audio generated successfully!"
48
+
49
+ # Gradio Interface
50
# Page layout: a heading, then one row each for the text box, the trigger
# button, and the results (audio player plus status text).
with gr.Blocks() as demo:
    gr.Markdown("## 🎙️ Gemini 2.5 Text-to-Speech Demo")

    with gr.Row():
        text_input = gr.Textbox(label="Enter text to convert to speech")

    with gr.Row():
        submit_button = gr.Button("Generate Speech")

    with gr.Row():
        audio_output = gr.Audio(label="Generated Audio", type="filepath")
        status_output = gr.Textbox(label="Status")

    # generate_tts returns (wav path or None, status message), which maps
    # onto the two output components in order.
    submit_button.click(
        generate_tts,
        inputs=[text_input],
        outputs=[audio_output, status_output],
    )

# Start the web server only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()