File size: 7,025 Bytes
740846d
b8a34b4
740846d
 
 
cb63aa0
 
740846d
 
b8a34b4
 
cb63aa0
bdfd7a5
740846d
cb63aa0
740846d
 
bdfd7a5
740846d
 
 
 
 
 
 
 
 
43ac355
 
 
8bdf1fa
43ac355
 
 
740846d
43ac355
 
 
740846d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cb63aa0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
740846d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cb63aa0
 
 
 
 
740846d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cb63aa0
 
 
740846d
 
 
 
cb63aa0
 
 
 
 
 
 
 
 
 
 
 
740846d
cb63aa0
740846d
 
 
 
 
 
 
43ac355
740846d
 
 
 
 
 
 
43ac355
cb63aa0
 
 
 
 
740846d
 
 
 
43ac355
740846d
 
 
 
b8a34b4
740846d
bdfd7a5
5f3d5cb
740846d
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
import gradio as gr
import asyncio
import base64
import io
import cv2
import sounddevice as sd
import numpy as np
import PIL.Image
import mss
from google import genai
from google.genai import types
import soundfile as sf

# Configuration
SAMPLE_RATE = 24000  # Hz — matches the Gemini Live API's 24 kHz PCM audio output
CHUNK_SIZE = 1024  # audio chunk size in frames (declared but not used below — TODO confirm intent)
MODEL = "models/gemini-2.0-flash-exp"  # Live-API-capable Gemini model id

class GeminiTTS:
    """Wrap the Gemini Live API to turn text / camera / screen input into speech.

    Audio replies are returned as a ``(sample_rate, np.ndarray)`` tuple suitable
    for a Gradio ``Audio`` component; text replies come back as plain strings.
    """

    def __init__(self, api_key):
        """Create the Gemini client and the live-session configuration.

        Args:
            api_key: Google Generative AI API key.
        """
        self.client = genai.Client(http_options={"api_version": "v1alpha"}, api_key=api_key)
        self.audio_in_queue = asyncio.Queue()
        self.out_queue = asyncio.Queue(maxsize=5)
        self.session = None
        self.audio_stream = None

        self.config = types.LiveConnectConfig(
            response_modalities=["audio"],
            speech_config=types.SpeechConfig(
                voice_config=types.VoiceConfig(
                    prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Puck")
                )
            ),
            system_instruction=types.Content(
                parts=[types.Part.from_text(text="Answer user ask, replay same thing user say no other word explain")],
                role="user"
            ),
        )

    @staticmethod
    def _encode_jpeg(img):
        """Serialize a PIL image to the base64 JPEG dict the Live API expects."""
        buf = io.BytesIO()
        img.save(buf, format="jpeg")
        return {"mime_type": "image/jpeg",
                "data": base64.b64encode(buf.getvalue()).decode()}

    async def _get_frame(self, cap):
        """Grab one frame from an open cv2 capture and JPEG-encode it.

        Returns:
            ``{"mime_type", "data"}`` dict (base64 payload), or ``None`` if the
            capture yielded no frame.
        """
        ret, frame = cap.read()
        if not ret:
            return None
        # OpenCV delivers BGR; PIL expects RGB.
        img = PIL.Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        img.thumbnail([1024, 1024])  # cap resolution before upload
        return self._encode_jpeg(img)

    async def _get_screen(self):
        """Capture the full virtual screen and JPEG-encode it."""
        # Context manager releases the mss handle; the original leaked it
        # on every call.
        with mss.mss() as sct:
            monitor = sct.monitors[0]  # monitor 0 = all monitors combined
            shot = sct.grab(monitor)
            img = PIL.Image.open(io.BytesIO(mss.tools.to_png(shot.rgb, shot.size)))
        return self._encode_jpeg(img)

    async def record_audio(self, duration=5):
        """Record ``duration`` seconds of mono float32 audio from the default mic.

        NOTE(review): ``sd.wait()`` blocks the event loop for the whole
        recording; acceptable for this demo, but a real app should wrap this
        in ``asyncio.to_thread``.
        """
        print(f"Recording for {duration} seconds...")
        recording = sd.rec(int(duration * SAMPLE_RATE),
                           samplerate=SAMPLE_RATE,
                           channels=1,
                           dtype='float32')
        sd.wait()  # Wait until recording is finished
        return recording

    async def play_audio(self, audio_data):
        """Play a numpy audio buffer on the default output device (blocking)."""
        sd.play(audio_data, samplerate=SAMPLE_RATE)
        sd.wait()  # Wait until playback is finished

    async def process_input(self, text=None, mode="text"):
        """Send one turn to the Live API and collect the full reply.

        Args:
            text: prompt text (used when ``mode == "text"``).
            mode: one of ``"text"``, ``"camera"``, ``"screen"``.

        Returns:
            ``(SAMPLE_RATE, np.ndarray)`` for an audio reply, the reply string
            for a text reply, or an ``"Error: ..."`` string on failure.
        """
        try:
            async with self.client.aio.live.connect(model=MODEL, config=self.config) as session:
                self.session = session

                if mode == "text" and text:
                    await session.send(input=text or ".", end_of_turn=True)
                elif mode == "camera":
                    cap = cv2.VideoCapture(0)
                    try:
                        frame = await self._get_frame(cap)
                    finally:
                        cap.release()  # always free the camera, even on error
                    if frame:
                        # end_of_turn so the model actually starts replying;
                        # the original omitted it for image input.
                        await session.send(input=frame, end_of_turn=True)
                elif mode == "screen":
                    frame = await self._get_screen()
                    if frame:
                        await session.send(input=frame, end_of_turn=True)

                # Accumulate the whole streamed turn. The original returned on
                # the FIRST chunk, truncating the audio to a few milliseconds,
                # and fed raw bytes to soundfile (which expects an array).
                audio_chunks = []
                text_reply = None
                async for response in session.receive():
                    if response.data:
                        audio_chunks.append(response.data)
                    elif response.text:
                        text_reply = (text_reply or "") + response.text

                if audio_chunks:
                    # Live API audio is raw 16-bit PCM (per the API docs);
                    # convert to the (rate, ndarray) tuple Gradio expects.
                    pcm = np.frombuffer(b"".join(audio_chunks), dtype=np.int16)
                    return (SAMPLE_RATE, pcm)
                if text_reply:
                    return text_reply
                return "No response received"
        except Exception as e:
            # Surface the failure in the UI instead of crashing the handler.
            return f"Error: {str(e)}"

def create_gradio_interface():
    """Build the Gradio UI: text / voice / camera / screen tabs backed by GeminiTTS.

    Returns:
        The assembled ``gr.Blocks`` demo (not yet launched).
    """
    tts_handler = None

    def init_tts(api_key):
        """Create the GeminiTTS handler from the user-supplied API key."""
        nonlocal tts_handler
        tts_handler = GeminiTTS(api_key)
        return "Gemini TTS Initialized!"

    def _require_handler():
        """Fail fast with a UI error if Initialize was never clicked."""
        if not tts_handler:
            raise gr.Error("Please initialize the TTS system first with your API key")

    async def generate_response(text, mode):
        _require_handler()
        # Result may be (sample_rate, audio) or a plain string; gr.Audio only
        # renders the tuple form — string replies surface as a component error.
        return await tts_handler.process_input(text, mode)

    async def record_and_process():
        _require_handler()
        recording = await tts_handler.record_audio(duration=5)
        # Placeholder for real audio-in processing: just play it back.
        await tts_handler.play_audio(recording)
        # Fix: hand Gradio the ndarray itself. The original passed raw
        # .tobytes(), which gr.Audio cannot interpret as float32 samples.
        return (SAMPLE_RATE, recording)

    with gr.Blocks(title="Gemini TTS Interface") as demo:
        gr.Markdown("# 🎤 Gemini Text-to-Speech Interface with SoundDevice")

        with gr.Row():
            api_key = gr.Textbox(label="Gemini API Key", type="password")
            init_btn = gr.Button("Initialize TTS")

        init_output = gr.Textbox(label="Initialization Status", interactive=False)
        init_btn.click(init_tts, inputs=api_key, outputs=init_output)

        with gr.Tab("Text Input"):
            with gr.Row():
                text_input = gr.Textbox(label="Enter Text", lines=3)
                text_btn = gr.Button("Generate Speech")

            text_output = gr.Audio(label="Generated Speech")
            # Hidden Text components carry the fixed `mode` argument.
            text_btn.click(generate_response, inputs=[text_input, gr.Text("text", visible=False)], outputs=text_output)

        with gr.Tab("Voice Input"):
            record_btn = gr.Button("Record and Process (5 sec)")
            voice_output = gr.Audio(label="Processed Audio")
            record_btn.click(record_and_process, outputs=voice_output)

        with gr.Tab("Camera Input"):
            camera_btn = gr.Button("Capture and Process")
            camera_output = gr.Audio(label="Generated Speech from Camera")
            camera_btn.click(generate_response, inputs=[gr.Text("", visible=False), gr.Text("camera", visible=False)], outputs=camera_output)

        with gr.Tab("Screen Capture"):
            screen_btn = gr.Button("Capture Screen and Process")
            screen_output = gr.Audio(label="Generated Speech from Screen")
            screen_btn.click(generate_response, inputs=[gr.Text("", visible=False), gr.Text("screen", visible=False)], outputs=screen_output)

    return demo

if __name__ == "__main__":
    # Build and serve the Gradio app when run as a script.
    create_gradio_interface().launch()