File size: 5,650 Bytes
740846d b8a34b4 740846d cb63aa0 740846d b8a34b4 b1483f2 cb63aa0 bdfd7a5 740846d cb63aa0 740846d bdfd7a5 740846d 43ac355 8bdf1fa 43ac355 740846d 43ac355 740846d b1483f2 cb63aa0 b1483f2 740846d b1483f2 cb63aa0 740846d b1483f2 740846d 43ac355 740846d b1483f2 cb63aa0 740846d b1483f2 43ac355 740846d b1483f2 b8a34b4 740846d bdfd7a5 5f3d5cb 740846d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 |
import gradio as gr
import asyncio
import base64
import io
import cv2
import numpy as np
import PIL.Image
import mss
from google import genai
from google.genai import types
from pydub import AudioSegment
from pydub.playback import play
import soundfile as sf
# Configuration
# Sample rate in Hz attached to the audio that GeminiTTS.process_input returns.
SAMPLE_RATE = 24000
# Model identifier passed to client.aio.live.connect() when a session is opened.
MODEL = "models/gemini-2.0-flash-exp"
class GeminiTTS:
    """Send one text prompt, camera frame, or screenshot to a Gemini Live
    session and collect the spoken reply.

    Every call to :meth:`process_input` opens a fresh live connection using
    the client and configuration built in ``__init__``.
    """

    def __init__(self, api_key):
        """Create the API client and the live-session configuration.

        Args:
            api_key: Gemini API key forwarded to ``genai.Client``.
        """
        self.client = genai.Client(http_options={"api_version": "v1alpha"}, api_key=api_key)
        # Holds the live session only while process_input is running.
        self.session = None
        self.config = types.LiveConnectConfig(
            response_modalities=["audio"],
            speech_config=types.SpeechConfig(
                voice_config=types.VoiceConfig(
                    prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Puck")
                )
            ),
            system_instruction=types.Content(
                parts=[types.Part.from_text(text="Answer user ask, replay same thing user say no other word explain")],
                role="user"
            ),
        )

    @staticmethod
    def _encode_jpeg(img, max_size=None):
        """Return a PIL image as the base64 JPEG payload dict sent to the session.

        When ``max_size`` is given the image is shrunk in place to fit it.
        """
        if max_size is not None:
            img.thumbnail(max_size)
        buffer = io.BytesIO()
        img.save(buffer, format="jpeg")
        return {"mime_type": "image/jpeg", "data": base64.b64encode(buffer.getvalue()).decode()}

    @staticmethod
    def _decode_pcm(chunks):
        """Join raw audio chunks and decode them as little-endian 16-bit PCM.

        Args:
            chunks: Iterable of ``bytes`` objects in arrival order.

        Returns:
            A writable one-dimensional ``int16`` NumPy array (empty when no
            bytes were received).
        """
        raw = b"".join(chunks)
        # A dangling odd byte cannot form a 16-bit sample; drop it rather
        # than let np.frombuffer raise on the whole reply.
        raw = raw[: len(raw) - (len(raw) % 2)]
        return np.frombuffer(raw, dtype="<i2").copy()

    async def _get_frame(self, cap):
        """Read one frame from an opened ``cv2.VideoCapture``.

        Returns:
            The JPEG payload dict (downscaled to fit 1024x1024), or ``None``
            when the capture device yields no frame.
        """
        ret, frame = cap.read()
        if not ret:
            return None
        # OpenCV delivers BGR; PIL expects RGB.
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        return self._encode_jpeg(PIL.Image.fromarray(frame_rgb), max_size=(1024, 1024))

    async def _get_screen(self):
        """Grab ``monitors[0]`` with mss and return it as a JPEG payload dict."""
        # The context manager releases the screen-capture handle even if
        # grabbing or encoding fails.
        with mss.mss() as sct:
            shot = sct.grab(sct.monitors[0])
            png_bytes = mss.tools.to_png(shot.rgb, shot.size)
        return self._encode_jpeg(PIL.Image.open(io.BytesIO(png_bytes)))

    async def process_input(self, text=None, mode="text"):
        """Run one request/response turn against the live model.

        Args:
            text: Prompt to send. In ``"camera"`` and ``"screen"`` modes it
                accompanies the captured image. A falsy value is replaced by
                ``"."`` so the turn is always completed.
            mode: ``"text"``, ``"camera"`` or ``"screen"``.

        Returns:
            ``(SAMPLE_RATE, samples)`` with ``samples`` an ``int16`` NumPy
            array when audio was received; otherwise the model's text reply,
            ``"No response received"``, or an ``"Error: ..."`` string. No
            exception derived from ``Exception`` propagates to the caller.
        """
        try:
            # Capture before connecting so a failed capture does not open a
            # session that would never be used.
            frame = None
            if mode == "camera":
                cap = cv2.VideoCapture(0)
                try:
                    frame = await self._get_frame(cap)
                finally:
                    cap.release()
                if frame is None:
                    return "Error: could not read a frame from the camera"
            elif mode == "screen":
                frame = await self._get_screen()
            elif mode != "text":
                return f"Error: unsupported mode {mode!r}"

            audio_chunks = []
            text_parts = []
            async with self.client.aio.live.connect(model=MODEL, config=self.config) as session:
                self.session = session
                try:
                    if frame is not None:
                        await session.send(input=frame)
                    # Always close the turn explicitly: without an
                    # end_of_turn message the receive loop below has
                    # nothing to wait for and never finishes.
                    await session.send(input=text or ".", end_of_turn=True)

                    # Gather the whole turn; audio arrives in many chunks.
                    async for response in session.receive():
                        if response.data:
                            audio_chunks.append(response.data)
                        if response.text:
                            text_parts.append(response.text)
                finally:
                    # The session is closed once the context exits.
                    self.session = None

            if audio_chunks:
                return (SAMPLE_RATE, self._decode_pcm(audio_chunks))
            if text_parts:
                return "".join(text_parts)
            return "No response received"
        except Exception as e:
            return f"Error: {str(e)}"
def create_gradio_interface():
    """Build the Gradio Blocks UI for the Gemini speech demo.

    The UI has an API-key row that creates the ``GeminiTTS`` handler and three
    tabs (text, camera, screen capture), each with a button that requests a
    spoken reply and an audio player that shows it.

    Returns:
        The assembled ``gr.Blocks`` app; the caller is expected to launch it.
    """
    # NOTE: this closure variable is shared by every browser session served
    # by the returned app, so the most recently entered API key is used for
    # all of them.
    tts_handler = None

    def init_tts(api_key):
        """Create the handler from the entered key and report status."""
        nonlocal tts_handler
        api_key = (api_key or "").strip()
        if not api_key:
            raise gr.Error("Please enter your Gemini API key")
        tts_handler = GeminiTTS(api_key)
        return "Gemini TTS Initialized!"

    async def generate_response(text, mode):
        """Run one turn and return a value the audio player can display."""
        if not tts_handler:
            raise gr.Error("Please initialize the TTS system first with your API key")
        result = await tts_handler.process_input(text, mode)
        # process_input reports failures, "no response", and text-only
        # replies as plain strings. gr.Audio would interpret a string as a
        # file path, so show it to the user as an error instead.
        if isinstance(result, str):
            raise gr.Error(result)
        return result

    # One handler per tab binds the mode on the server side, so no hidden
    # input components are needed to carry constants.
    async def speak_text(text):
        return await generate_response(text, "text")

    async def speak_camera():
        return await generate_response("", "camera")

    async def speak_screen():
        return await generate_response("", "screen")

    with gr.Blocks(title="Gemini TTS Interface") as demo:
        gr.Markdown("# 🎤 Gemini Text-to-Speech Interface")

        with gr.Row():
            api_key = gr.Textbox(label="Gemini API Key", type="password")
            init_btn = gr.Button("Initialize TTS")
        init_output = gr.Textbox(label="Initialization Status", interactive=False)
        init_btn.click(init_tts, inputs=api_key, outputs=init_output)

        with gr.Tab("Text Input"):
            with gr.Row():
                text_input = gr.Textbox(label="Enter Text", lines=3)
                text_btn = gr.Button("Generate Speech")
            text_output = gr.Audio(label="Generated Speech")
            text_btn.click(speak_text, inputs=text_input, outputs=text_output)

        with gr.Tab("Camera Input"):
            camera_btn = gr.Button("Capture and Process")
            camera_output = gr.Audio(label="Generated Speech from Camera")
            camera_btn.click(speak_camera, inputs=None, outputs=camera_output)

        with gr.Tab("Screen Capture"):
            screen_btn = gr.Button("Capture Screen and Process")
            screen_output = gr.Audio(label="Generated Speech from Screen")
            screen_btn.click(speak_screen, inputs=None, outputs=screen_output)

    return demo
# Script entry point: build the interface and start the Gradio server.
if __name__ == "__main__":
    demo = create_gradio_interface()
    demo.launch()