Athspi committed on
Commit
b1483f2
·
verified ·
1 Parent(s): d66ff40

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -54
app.py CHANGED
@@ -3,26 +3,23 @@ import asyncio
3
  import base64
4
  import io
5
  import cv2
6
- import sounddevice as sd
7
  import numpy as np
8
  import PIL.Image
9
  import mss
10
  from google import genai
11
  from google.genai import types
 
 
12
  import soundfile as sf
13
 
14
  # Configuration
15
  SAMPLE_RATE = 24000
16
- CHUNK_SIZE = 1024
17
  MODEL = "models/gemini-2.0-flash-exp"
18
 
19
  class GeminiTTS:
20
  def __init__(self, api_key):
21
  self.client = genai.Client(http_options={"api_version": "v1alpha"}, api_key=api_key)
22
- self.audio_in_queue = asyncio.Queue()
23
- self.out_queue = asyncio.Queue(maxsize=5)
24
  self.session = None
25
- self.audio_stream = None
26
 
27
  self.config = types.LiveConnectConfig(
28
  response_modalities=["audio"],
@@ -59,21 +56,6 @@ class GeminiTTS:
59
  image_io.seek(0)
60
  return {"mime_type": "image/jpeg", "data": base64.b64encode(image_io.read()).decode()}
61
 
62
- async def record_audio(self, duration=5):
63
- """Record audio using sounddevice"""
64
- print(f"Recording for {duration} seconds...")
65
- recording = sd.rec(int(duration * SAMPLE_RATE),
66
- samplerate=SAMPLE_RATE,
67
- channels=1,
68
- dtype='float32')
69
- sd.wait() # Wait until recording is finished
70
- return recording
71
-
72
- async def play_audio(self, audio_data):
73
- """Play audio using sounddevice"""
74
- sd.play(audio_data, samplerate=SAMPLE_RATE)
75
- sd.wait() # Wait until playback is finished
76
-
77
  async def process_input(self, text=None, mode="text"):
78
  try:
79
  async with self.client.aio.live.connect(model=MODEL, config=self.config) as session:
@@ -96,11 +78,12 @@ class GeminiTTS:
96
  turn = session.receive()
97
  async for response in turn:
98
  if data := response.data:
99
- # Save audio to buffer
 
100
  with io.BytesIO() as wav_buffer:
101
- sf.write(wav_buffer, data, SAMPLE_RATE, format='WAV')
102
- wav_buffer.seek(0)
103
- return (SAMPLE_RATE, wav_buffer.read())
104
  if text := response.text:
105
  return text
106
 
@@ -119,30 +102,10 @@ def create_gradio_interface():
119
  async def generate_response(text, mode):
120
  if not tts_handler:
121
  raise gr.Error("Please initialize the TTS system first with your API key")
122
-
123
- result = await tts_handler.process_input(text, mode)
124
-
125
- if isinstance(result, tuple) and len(result) == 2:
126
- # Audio response (sample_rate, audio_data)
127
- return result
128
- else:
129
- # Text response
130
- return result
131
-
132
- async def record_and_process():
133
- if not tts_handler:
134
- raise gr.Error("Please initialize the TTS system first with your API key")
135
-
136
- # Record audio
137
- recording = await tts_handler.record_audio(duration=5)
138
-
139
- # Process audio (you would need to implement this part)
140
- # For now, we'll just play it back
141
- await tts_handler.play_audio(recording)
142
- return (SAMPLE_RATE, recording.tobytes())
143
 
144
  with gr.Blocks(title="Gemini TTS Interface") as demo:
145
- gr.Markdown("# 🎤 Gemini Text-to-Speech Interface with SoundDevice")
146
 
147
  with gr.Row():
148
  api_key = gr.Textbox(label="Gemini API Key", type="password")
@@ -157,22 +120,29 @@ def create_gradio_interface():
157
  text_btn = gr.Button("Generate Speech")
158
 
159
  text_output = gr.Audio(label="Generated Speech")
160
- text_btn.click(generate_response, inputs=[text_input, gr.Text("text", visible=False)], outputs=text_output)
161
-
162
- with gr.Tab("Voice Input"):
163
- record_btn = gr.Button("Record and Process (5 sec)")
164
- voice_output = gr.Audio(label="Processed Audio")
165
- record_btn.click(record_and_process, outputs=voice_output)
166
 
167
  with gr.Tab("Camera Input"):
168
  camera_btn = gr.Button("Capture and Process")
169
  camera_output = gr.Audio(label="Generated Speech from Camera")
170
- camera_btn.click(generate_response, inputs=[gr.Text("", visible=False), gr.Text("camera", visible=False)], outputs=camera_output)
 
 
 
 
171
 
172
  with gr.Tab("Screen Capture"):
173
  screen_btn = gr.Button("Capture Screen and Process")
174
  screen_output = gr.Audio(label="Generated Speech from Screen")
175
- screen_btn.click(generate_response, inputs=[gr.Text("", visible=False), gr.Text("screen", visible=False)], outputs=screen_output)
 
 
 
 
176
 
177
  return demo
178
 
 
3
  import base64
4
  import io
5
  import cv2
 
6
  import numpy as np
7
  import PIL.Image
8
  import mss
9
  from google import genai
10
  from google.genai import types
11
+ from pydub import AudioSegment
12
+ from pydub.playback import play
13
  import soundfile as sf
14
 
15
  # Configuration
16
  SAMPLE_RATE = 24000
 
17
  MODEL = "models/gemini-2.0-flash-exp"
18
 
19
  class GeminiTTS:
20
  def __init__(self, api_key):
21
  self.client = genai.Client(http_options={"api_version": "v1alpha"}, api_key=api_key)
 
 
22
  self.session = None
 
23
 
24
  self.config = types.LiveConnectConfig(
25
  response_modalities=["audio"],
 
56
  image_io.seek(0)
57
  return {"mime_type": "image/jpeg", "data": base64.b64encode(image_io.read()).decode()}
58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  async def process_input(self, text=None, mode="text"):
60
  try:
61
  async with self.client.aio.live.connect(model=MODEL, config=self.config) as session:
 
78
  turn = session.receive()
79
  async for response in turn:
80
  if data := response.data:
81
+ # Convert to playable audio format
82
+ audio_array = np.frombuffer(data, dtype=np.float32)
83
  with io.BytesIO() as wav_buffer:
84
+ sf.write(wav_buffer, audio_array, SAMPLE_RATE, format='WAV')
85
+ wav_bytes = wav_buffer.getvalue()
86
+ return (SAMPLE_RATE, wav_bytes)
87
  if text := response.text:
88
  return text
89
 
 
102
  async def generate_response(text, mode):
103
  if not tts_handler:
104
  raise gr.Error("Please initialize the TTS system first with your API key")
105
+ return await tts_handler.process_input(text, mode)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
 
107
  with gr.Blocks(title="Gemini TTS Interface") as demo:
108
+ gr.Markdown("# 🎤 Gemini Text-to-Speech Interface")
109
 
110
  with gr.Row():
111
  api_key = gr.Textbox(label="Gemini API Key", type="password")
 
120
  text_btn = gr.Button("Generate Speech")
121
 
122
  text_output = gr.Audio(label="Generated Speech")
123
+ text_btn.click(
124
+ generate_response,
125
+ inputs=[text_input, gr.Text("text", visible=False)],
126
+ outputs=text_output
127
+ )
 
128
 
129
  with gr.Tab("Camera Input"):
130
  camera_btn = gr.Button("Capture and Process")
131
  camera_output = gr.Audio(label="Generated Speech from Camera")
132
+ camera_btn.click(
133
+ generate_response,
134
+ inputs=[gr.Text("", visible=False), gr.Text("camera", visible=False)],
135
+ outputs=camera_output
136
+ )
137
 
138
  with gr.Tab("Screen Capture"):
139
  screen_btn = gr.Button("Capture Screen and Process")
140
  screen_output = gr.Audio(label="Generated Speech from Screen")
141
+ screen_btn.click(
142
+ generate_response,
143
+ inputs=[gr.Text("", visible=False), gr.Text("screen", visible=False)],
144
+ outputs=screen_output
145
+ )
146
 
147
  return demo
148