arthrod commited on
Commit
b142be4
Β·
verified Β·
1 Parent(s): 922fa3c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +451 -168
app.py CHANGED
@@ -1,193 +1,476 @@
1
- import asyncio
2
- import base64
 
 
 
 
 
 
3
  import os
 
4
  import time
5
- from io import BytesIO
6
-
7
- import gradio as gr
8
  import numpy as np
9
- import websockets
10
- from dotenv import load_dotenv
11
- from fastrtc import (
12
- AsyncAudioVideoStreamHandler,
13
- Stream,
14
- WebRTC,
15
- get_cloudflare_turn_credentials_async,
16
- wait_for_item,
17
- )
18
  from google import genai
19
- from gradio.utils import get_space
20
- from PIL import Image
21
-
22
- load_dotenv()
23
-
24
-
25
- def encode_audio(data: np.ndarray) -> dict:
26
- """Encode Audio data to send to the server"""
27
- return {
28
- "mime_type": "audio/pcm",
29
- "data": base64.b64encode(data.tobytes()).decode("UTF-8"),
30
- }
31
 
 
 
32
 
33
- def encode_image(data: np.ndarray) -> dict:
34
- with BytesIO() as output_bytes:
35
- pil_image = Image.fromarray(data)
36
- pil_image.save(output_bytes, "JPEG")
37
- bytes_data = output_bytes.getvalue()
38
- base64_str = str(base64.b64encode(bytes_data), "utf-8")
39
- return {"mime_type": "image/jpeg", "data": base64_str}
40
-
41
-
42
- class GeminiHandler(AsyncAudioVideoStreamHandler):
43
- def __init__(
44
- self,
45
- ) -> None:
46
  super().__init__(
47
- "mono",
48
- output_sample_rate=24000,
49
- input_sample_rate=16000,
50
  )
51
- self.audio_queue = asyncio.Queue()
52
- self.video_queue = asyncio.Queue()
53
  self.session = None
54
  self.last_frame_time = 0
55
- self.quit = asyncio.Event()
56
-
57
- def copy(self) -> "GeminiHandler":
58
- return GeminiHandler()
59
-
60
  async def start_up(self):
61
- client = genai.Client(
62
- api_key=os.getenv("GEMINI_API_KEY"), http_options={"api_version": "v1alpha"}
63
- )
64
- config = {"response_modalities": ["AUDIO"]}
65
- async with client.aio.live.connect(
66
- model="gemini-2.0-flash-exp",
67
- config=config, # type: ignore
68
- ) as session:
69
- self.session = session
70
- while not self.quit.is_set():
71
- turn = self.session.receive()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  try:
73
- async for response in turn:
74
- if data := response.data:
75
- audio = np.frombuffer(data, dtype=np.int16).reshape(1, -1)
76
- self.audio_queue.put_nowait(audio)
77
- except websockets.exceptions.ConnectionClosedOK:
78
- print("connection closed")
79
- break
80
-
81
- async def video_receive(self, frame: np.ndarray):
82
- self.video_queue.put_nowait(frame)
83
-
84
- if self.session:
85
- # send image every 1 second
86
- print(time.time() - self.last_frame_time)
87
- if time.time() - self.last_frame_time > 1:
88
- self.last_frame_time = time.time()
89
- await self.session.send(input=encode_image(frame))
90
- if self.latest_args[1] is not None:
91
- await self.session.send(input=encode_image(self.latest_args[1]))
92
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  async def video_emit(self):
94
- frame = await wait_for_item(self.video_queue, 0.01)
95
- if frame is not None:
96
- return frame
97
- else:
98
- return np.zeros((100, 100, 3), dtype=np.uint8)
99
-
100
- async def receive(self, frame: tuple[int, np.ndarray]) -> None:
101
- _, array = frame
102
- array = array.squeeze()
103
- audio_message = encode_audio(array)
 
 
 
 
 
 
104
  if self.session:
105
- await self.session.send(input=audio_message)
 
 
 
 
 
 
 
 
 
 
 
 
106
 
107
- async def emit(self):
108
- array = await wait_for_item(self.audio_queue, 0.01)
109
- if array is not None:
110
- return (self.output_sample_rate, array)
111
- return array
112
 
113
- async def shutdown(self) -> None:
114
- if self.session:
115
- self.quit.set()
116
- await self.session.close()
117
- self.quit.clear()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
 
 
 
 
 
 
 
 
 
 
 
 
 
119
 
120
- stream = Stream(
121
- handler=GeminiHandler(),
122
- modality="audio-video",
123
- mode="send-receive",
124
- rtc_configuration=get_cloudflare_turn_credentials_async if get_space() else None,
125
- time_limit=180 if get_space() else None,
126
- additional_inputs=[
127
- gr.Image(label="Image", type="numpy", sources=["upload", "clipboard"])
128
- ],
129
- ui_args={
130
- "icon": "https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png",
131
- "pulse_color": "rgb(255, 255, 255)",
132
- "icon_button_color": "rgb(255, 255, 255)",
133
- "title": "Gemini Audio Video Chat",
134
- },
135
- )
136
 
137
- css = """
138
- #video-source {max-width: 600px !important; max-height: 600 !important;}
139
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
 
141
- with gr.Blocks(css=css) as demo:
142
- gr.HTML(
143
- """
144
- <div style='display: flex; align-items: center; justify-content: center; gap: 20px'>
145
- <div style="background-color: var(--block-background-fill); border-radius: 8px">
146
- <img src="https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png" style="width: 100px; height: 100px;">
147
- </div>
148
- <div>
149
- <h1>Gen AI SDK Voice Chat</h1>
150
- <p>Speak with Gemini using real-time audio + video streaming</p>
151
- <p>Powered by <a href="https://gradio.app/">Gradio</a> and <a href=https://freddyaboulton.github.io/gradio-webrtc/">WebRTC</a>⚑️</p>
152
- <p>Get an API Key <a href="https://support.google.com/googleapi/answer/6158862?hl=en">here</a></p>
153
- </div>
154
- </div>
155
- """
156
- )
157
- with gr.Row() as row:
158
- with gr.Column():
159
- webrtc = WebRTC(
160
- label="Video Chat",
161
- modality="audio-video",
162
- mode="send-receive",
163
- elem_id="video-source",
164
- rtc_configuration=get_cloudflare_turn_credentials_async
165
- if get_space()
166
- else None,
167
- icon="https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png",
168
- pulse_color="rgb(255, 255, 255)",
169
- icon_button_color="rgb(255, 255, 255)",
170
- )
171
- with gr.Column():
172
- image_input = gr.Image(
173
- label="Image", type="numpy", sources=["upload", "clipboard"]
174
- )
175
 
176
- webrtc.stream(
177
- GeminiHandler(),
178
- inputs=[webrtc, image_input],
179
- outputs=[webrtc],
180
- time_limit=180 if get_space() else None,
181
- concurrency_limit=2 if get_space() else None,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
 
184
- stream.ui = demo
185
-
186
-
187
  if __name__ == "__main__":
188
- if (mode := os.getenv("MODE")) == "UI":
189
- stream.ui.launch(server_port=7860)
190
- elif mode == "PHONE":
191
- raise ValueError("Phone mode not supported for this demo")
 
 
 
 
 
 
 
 
 
192
  else:
193
- stream.ui.launch(server_port=7860)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Real-Time Screen Assistant - Refactored for Google GenAI Live API + FastRTC
3
+
4
+ This application transforms the original screenshot analyzer into a real-time
5
+ screen sharing assistant with voice interaction, following the refactoring
6
+ instructions for live streaming capabilities.
7
+ """
8
+
9
  import os
10
+ import asyncio
11
  import time
 
 
 
12
  import numpy as np
13
+ import numpy.typing as npt
14
+ import cv2
15
+ import gradio as gr
16
+ from fastrtc import Stream, AsyncAudioVideoStreamHandler, get_cloudflare_turn_credentials_async, ReplyOnPause
 
 
 
 
 
17
  from google import genai
18
+ from google.genai import types
 
 
 
 
 
 
 
 
 
 
 
19
 
20
+ # Environment variable for API key
21
+ API_KEY = os.getenv("GEMINI_API_KEY", "")
22
 
23
+ class RealTimeScreenAssistant(AsyncAudioVideoStreamHandler):
24
+ """
25
+ Real-time screen assistant implementing the refactoring instructions.
26
+
27
+ Features:
28
+ - Google GenAI Live API integration
29
+ - Real-time audio/video streaming via FastRTC
30
+ - Voice activity detection with ReplyOnPause
31
+ - Intelligent frame sampling for screen sharing
32
+ - Cloudflare TURN server support for HF Spaces
33
+ """
34
+
35
+ def __init__(self):
36
  super().__init__(
37
+ expected_layout="mono",
38
+ output_sample_rate=24000,
39
+ input_sample_rate=16000
40
  )
 
 
41
  self.session = None
42
  self.last_frame_time = 0
43
+ self.audio_queue = asyncio.Queue()
44
+ self.connected = False
45
+ self.frame_interval = 1.0 # 1 FPS as per instructions
46
+
 
47
  async def start_up(self):
48
+ """Initialize Google GenAI Live session as per Task 8-10"""
49
+ try:
50
+ # Re-check environment variable in case it was set after import
51
+ current_api_key = os.getenv("GEMINI_API_KEY", "")
52
+ if not current_api_key:
53
+ print("❌ No GEMINI_API_KEY found in environment")
54
+ return
55
+
56
+ # Initialize client with v1alpha API (Task 8)
57
+ client = genai.Client(
58
+ api_key=current_api_key,
59
+ http_options={"api_version": "v1alpha"}
60
+ )
61
+
62
+ # Configure live session (Task 9) - minimal working config
63
+ from google.genai.types import LiveConnectConfig
64
+
65
+ # Start with minimal config to avoid WebSocket errors
66
+ config = LiveConnectConfig(
67
+ system_instruction=(
68
+ "You are a helpful real-time assistant who watches the user's screen and provides "
69
+ "guidance on using software. Give clear, step-by-step instructions based on what "
70
+ "you see and hear. Be proactive in offering assistance."
71
+ )
72
+ )
73
+
74
+ # Connect to Live API (Task 10) - using async context manager
75
+ self.session_context = client.aio.live.connect(
76
+ model="gemini-2.0-flash-live-001",
77
+ config=config
78
+ )
79
+ self.session = await self.session_context.__aenter__()
80
+
81
+ self.connected = True
82
+ print("βœ… Connected to Google GenAI Live API")
83
+
84
+ # Start response handler (Task 13)
85
+ self.response_task = asyncio.create_task(self._handle_responses())
86
+
87
+ except Exception as e:
88
+ print(f"❌ Failed to connect to GenAI: {e}")
89
+ self.connected = False
90
+
91
+ async def _handle_responses(self):
92
+ """Handle AI responses as per Task 12-13"""
93
+ try:
94
+ # The Google GenAI Live API provides an async iterator through session.receive()
95
+ # We need to handle this as a streaming response
96
+ while self.connected and self.session:
97
  try:
98
+ # Get the next message from the session
99
+ response_stream = self.session.receive()
100
+
101
+ # Check if this is an async iterator or needs to be awaited
102
+ if hasattr(response_stream, '__aiter__'):
103
+ # It's an async iterator
104
+ async for msg in response_stream:
105
+ if not self.connected:
106
+ break
107
+
108
+ if msg.data: # Audio response
109
+ # Convert to numpy for FastRTC (Task 13)
110
+ audio_array = np.frombuffer(msg.data, dtype=np.int16)
111
+ if len(audio_array) > 0:
112
+ audio_array = audio_array.reshape(1, -1)
113
+ await self.audio_queue.put(audio_array)
114
+
115
+ if msg.text: # Text response
116
+ print(f"πŸ€– AI: {msg.text}")
117
+ else:
118
+ # It's a single response that needs to be awaited
119
+ msg = await response_stream
120
+ if msg:
121
+ if msg.data: # Audio response
122
+ audio_array = np.frombuffer(msg.data, dtype=np.int16)
123
+ if len(audio_array) > 0:
124
+ audio_array = audio_array.reshape(1, -1)
125
+ await self.audio_queue.put(audio_array)
126
+
127
+ if msg.text: # Text response
128
+ print(f"πŸ€– AI: {msg.text}")
129
+
130
+ except Exception as inner_e:
131
+ if "connection" in str(inner_e).lower() or "closed" in str(inner_e).lower():
132
+ print("πŸ”΄ Connection closed, stopping response handler")
133
+ break
134
+ else:
135
+ print(f"⚠️ Response handling error: {inner_e}")
136
+ await asyncio.sleep(0.1) # Brief pause before retry
137
+
138
+ except Exception as e:
139
+ print(f"❌ Error handling AI responses: {e}")
140
+
141
+ async def receive(self, frame: tuple[int, npt.NDArray[np.int16]]):
142
+ """Handle microphone audio (Task 11)"""
143
+ if not self.connected or not self.session:
144
+ return
145
+
146
+ try:
147
+ _, audio_np = frame
148
+ audio_bytes = audio_np.tobytes()
149
+
150
+ # Send audio to GenAI Live API using new non-deprecated method
151
+ await self.session.send_realtime_input(
152
+ input=types.Blob(
153
+ data=audio_bytes,
154
+ mime_type="audio/pcm;rate=16000"
155
+ )
156
+ )
157
+ except Exception as e:
158
+ print(f"❌ Error sending audio: {e}")
159
+
160
+ async def video_receive(self, frame: npt.NDArray[np.float32]):
161
+ """Handle screen video frames (Task 11-12)"""
162
+ if not self.connected or not self.session:
163
+ return
164
+
165
+ try:
166
+ # Throttle to 1 FPS as per instructions
167
+ current_time = time.time()
168
+ if current_time - self.last_frame_time < self.frame_interval:
169
+ return
170
+
171
+ self.last_frame_time = current_time
172
+
173
+ # Convert float32 frame to uint8 for JPEG encoding
174
+ if frame.dtype == np.float32:
175
+ # Assuming frame is in range [0, 1], convert to [0, 255]
176
+ frame_uint8 = (frame * 255).astype(np.uint8)
177
+ else:
178
+ frame_uint8 = frame.astype(np.uint8)
179
+
180
+ # Check for empty frame before encoding
181
+ if frame_uint8.size == 0 or frame_uint8.shape[0] == 0 or frame_uint8.shape[1] == 0:
182
+ return
183
+
184
+ # Encode as JPEG (Task 12)
185
+ try:
186
+ success, jpg_bytes = cv2.imencode('.jpg', frame_uint8, [cv2.IMWRITE_JPEG_QUALITY, 80])
187
+ if not success:
188
+ return
189
+ except cv2.error:
190
+ # Handle OpenCV encoding errors gracefully
191
+ return
192
+
193
+ # Send to GenAI using new non-deprecated method
194
+ await self.session.send_realtime_input(
195
+ input=types.Blob(
196
+ data=jpg_bytes.tobytes(),
197
+ mime_type="image/jpeg"
198
+ )
199
+ )
200
+
201
+ except Exception as e:
202
+ print(f"❌ Error sending video frame: {e}")
203
+
204
+ async def emit(self):
205
+ """Emit audio back to user (Task 13)"""
206
+ try:
207
+ audio_chunk = self.audio_queue.get_nowait()
208
+ return (24000, audio_chunk)
209
+ except asyncio.QueueEmpty:
210
+ return None
211
+
212
+ def copy(self):
213
+ """Copy method required by FastRTC AsyncAudioVideoStreamHandler"""
214
+ # Return a new instance with same configuration
215
+ new_instance = RealTimeScreenAssistant()
216
+ new_instance.frame_interval = self.frame_interval
217
+ return new_instance
218
+
219
  async def video_emit(self):
220
+ """Video emit method required by FastRTC AsyncAudioVideoStreamHandler"""
221
+ # For this use case, we don't emit video back to user
222
+ return None
223
+
224
+ async def shutdown(self):
225
+ """Clean shutdown (Task 17)"""
226
+ self.connected = False
227
+
228
+ # Cancel response handler task if it exists
229
+ if hasattr(self, 'response_task') and not self.response_task.done():
230
+ self.response_task.cancel()
231
+ try:
232
+ await self.response_task
233
+ except asyncio.CancelledError:
234
+ pass
235
+
236
  if self.session:
237
+ try:
238
+ # Properly close the session using context manager
239
+ if hasattr(self, 'session_context'):
240
+ await self.session_context.__aexit__(None, None, None)
241
+ else:
242
+ await self.session.close()
243
+ print("πŸ”΄ Disconnected from GenAI Live API")
244
+ except Exception as e:
245
+ print(f"❌ Error during shutdown: {e}")
246
+
247
+ self.session = None
248
+ if hasattr(self, 'session_context'):
249
+ self.session_context = None
250
 
251
+ # Global state
252
+ app_state = {"stream": None, "handler": None, "connected": False}
 
 
 
253
 
254
+ def initialize_real_time_assistant():
255
+ """Initialize the real-time assistant (Task 26-29)"""
256
+ try:
257
+ # Create handler
258
+ handler = RealTimeScreenAssistant()
259
+ app_state["handler"] = handler
260
+
261
+ # Create stream with Cloudflare TURN (Task 22-23)
262
+ stream = Stream(
263
+ handler=ReplyOnPause(handler), # Voice activity detection (Task 3)
264
+ modality="audio-video",
265
+ mode="send-receive",
266
+ rtc_configuration=get_cloudflare_turn_credentials_async,
267
+ time_limit=300 # 5 minute limit for Spaces
268
+ )
269
+
270
+ app_state["stream"] = stream
271
+ return stream
272
+
273
+ except Exception as e:
274
+ print(f"❌ Error creating stream: {e}")
275
+ return None
276
 
277
+ def handle_connect():
278
+ """Connect button handler (Task 16)"""
279
+ # Re-check environment variable in case it was set after import
280
+ current_api_key = os.getenv("GEMINI_API_KEY", "")
281
+ if not current_api_key:
282
+ return "❌ Please set GEMINI_API_KEY environment variable"
283
+
284
+ if app_state["connected"]:
285
+ return "βœ… Already connected - session is active"
286
+
287
+ app_state["connected"] = True
288
+ return "βœ… Connecting... Please allow microphone and camera permissions"
289
 
290
+ def handle_disconnect():
291
+ """Disconnect button handler (Task 17)"""
292
+ if app_state["handler"] and app_state["connected"]:
293
+ asyncio.create_task(app_state["handler"].shutdown())
294
+ app_state["connected"] = False
295
+ return "πŸ”΄ Disconnected from AI assistant"
296
+
297
+ return "Already disconnected"
 
 
 
 
 
 
 
 
298
 
299
+ # Screen sharing JavaScript (Task 16)
300
+ screen_share_js = """
301
+ async function shareScreen() {
302
+ try {
303
+ const screenStream = await navigator.mediaDevices.getDisplayMedia({
304
+ video: {
305
+ mediaSource: 'screen',
306
+ width: { ideal: 1280 },
307
+ height: { ideal: 720 }
308
+ },
309
+ audio: false
310
+ });
311
+
312
+ const videoElements = document.querySelectorAll('video');
313
+ const webrtcVideo = Array.from(videoElements).find(video =>
314
+ video.srcObject && video.srcObject.getVideoTracks().length > 0
315
+ );
316
+
317
+ if (webrtcVideo && webrtcVideo.srcObject) {
318
+ const videoTrack = screenStream.getVideoTracks()[0];
319
+ const currentTrack = webrtcVideo.srcObject.getVideoTracks()[0];
320
+
321
+ webrtcVideo.srcObject.removeTrack(currentTrack);
322
+ webrtcVideo.srcObject.addTrack(videoTrack);
323
+
324
+ videoTrack.onended = () => {
325
+ console.log('Screen sharing ended');
326
+ };
327
+
328
+ return "πŸ–₯️ Screen sharing started";
329
+ } else {
330
+ return "❌ Could not find video element";
331
+ }
332
+
333
+ } catch (error) {
334
+ console.error('Screen sharing error:', error);
335
+ if (error.name === 'NotAllowedError') {
336
+ return "❌ Screen sharing permission denied";
337
+ } else {
338
+ return `❌ Screen sharing failed: ${error.message}`;
339
+ }
340
+ }
341
+ }
342
 
343
+ return shareScreen();
344
+ """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
345
 
346
+ def create_interface():
347
+ """Create main interface (Task 26-30)"""
348
+
349
+ # Initialize stream
350
+ stream = initialize_real_time_assistant()
351
+
352
+ with gr.Blocks(
353
+ title="Real-Time Screen Assistant",
354
+ theme=gr.themes.Soft()
355
+ ) as demo:
356
+
357
+ gr.Markdown("# πŸ–₯️ Real-Time Screen Assistant")
358
+ gr.Markdown("""
359
+ **Live AI assistant with screen sharing and voice interaction**
360
+
361
+ Refactored according to instructions for real-time capabilities using:
362
+ - Google GenAI Live API for multimodal AI processing
363
+ - FastRTC for low-latency audio/video streaming
364
+ - Voice activity detection with ReplyOnPause
365
+ - Cloudflare TURN servers for HF Spaces compatibility
366
+ """)
367
+
368
+ # Status display
369
+ status_display = gr.Textbox(
370
+ label="Status",
371
+ value="Ready to connect - Click Connect to start real-time session",
372
+ interactive=False
373
  )
374
+
375
+ # Control buttons (Task 3, 16-17)
376
+ with gr.Row():
377
+ connect_btn = gr.Button("πŸ”— Connect", variant="primary")
378
+ screen_btn = gr.Button("πŸ–₯️ Show Your Screen", variant="secondary")
379
+ disconnect_btn = gr.Button("πŸ”΄ Disconnect", variant="stop")
380
+
381
+ # Stream interface
382
+ if stream and hasattr(stream, 'ui'):
383
+ gr.Markdown("### πŸ“‘ Live Stream")
384
+ # Mount the FastRTC stream UI
385
+ stream_ui = stream.ui
386
+ else:
387
+ stream_ui = gr.HTML("<div>⚠️ Stream initialization failed</div>")
388
+
389
+ # Instructions (Task 1-3)
390
+ with gr.Accordion("πŸ“‹ Instructions", open=True):
391
+ gr.Markdown("""
392
+ **How to use the real-time assistant:**
393
+
394
+ 1. **Connect**: Click Connect to start the AI session
395
+ 2. **Permissions**: Allow microphone and camera access
396
+ 3. **Show Screen**: Click "Show Your Screen" to share your screen
397
+ 4. **Voice Interaction**: Simply speak - the AI will respond
398
+ 5. **Real-time Guidance**: AI sees your screen and provides live help
399
+ 6. **Disconnect**: Click Disconnect when finished
400
+
401
+ **Features implemented from refactoring instructions:**
402
+ - βœ… FastRTC WebRTC streaming (Task 2)
403
+ - βœ… Google GenAI Live API integration (Task 7-15)
404
+ - βœ… Connect/Show Screen/Disconnect controls (Task 3, 16-17)
405
+ - βœ… Voice activity detection with ReplyOnPause (Task 3)
406
+ - βœ… Screen sharing via getDisplayMedia (Task 6)
407
+ - βœ… Real-time advice generation (Task 18-21)
408
+ - βœ… Cloudflare TURN for HF Spaces (Task 22-23)
409
+ """)
410
+
411
+ # Privacy notice (Task 24-25)
412
+ with gr.Accordion("πŸ”’ Privacy & Security", open=False):
413
+ gr.Markdown("""
414
+ **Privacy Notice:**
415
+ - Screen content and voice are processed by Google's AI services
416
+ - Data is transmitted securely via encrypted WebRTC connections
417
+ - No permanent storage - all processing is real-time
418
+ - You control what is shared and can disconnect anytime
419
+
420
+ **Technical Details:**
421
+ - Uses Google Gemini Live API for real-time multimodal processing
422
+ - FastRTC provides low-latency WebRTC streaming
423
+ - Cloudflare TURN servers ensure reliable connectivity on HF Spaces
424
+ - Voice activity detection prevents interruptions
425
+ """)
426
+
427
+ # Wire up controls
428
+ connect_btn.click(
429
+ fn=handle_connect,
430
+ outputs=[status_display]
431
+ )
432
+
433
+ screen_btn.click(
434
+ fn=lambda: "πŸ–₯️ Requesting screen share...",
435
+ outputs=[status_display],
436
+ js=screen_share_js
437
+ )
438
+
439
+ disconnect_btn.click(
440
+ fn=handle_disconnect,
441
+ outputs=[status_display]
442
+ )
443
+
444
+ return demo
445
 
446
+ # Main execution
 
 
447
  if __name__ == "__main__":
448
+ print("πŸ–₯️ Real-Time Screen Assistant")
449
+ print("=" * 50)
450
+ print("Refactored according to instructions for:")
451
+ print("- Google GenAI Live API integration")
452
+ print("- FastRTC real-time streaming")
453
+ print("- Voice activity detection")
454
+ print("- Screen sharing capabilities")
455
+ print("- Cloudflare TURN for HF Spaces")
456
+
457
+ if not API_KEY:
458
+ print("\n⚠️ No GEMINI_API_KEY environment variable found")
459
+ print("Please set your Google AI API key:")
460
+ print("export GEMINI_API_KEY='your-api-key-here'")
461
  else:
462
+ print(f"\nβœ… API key configured (length: {len(API_KEY)})")
463
+
464
+ print("\nπŸš€ Starting real-time assistant...")
465
+
466
+ try:
467
+ demo = create_interface()
468
+ demo.launch(
469
+ server_name="0.0.0.0",
470
+ server_port=7860,
471
+ share=False,
472
+ show_error=True
473
+ )
474
+ except Exception as e:
475
+ print(f"❌ Failed to launch: {e}")
476
+ print("Ensure all dependencies are installed: pip install -r requirements.txt")