Spaces:
Sleeping
Update app.py
app.py
CHANGED
[Removed by this commit: the previous revision of app.py (about 674 lines across three hunks). Only fragments of the deleted code survive in this view. Its module docstring described a "PREMIUM, BEST WORKING" build with comprehensive real-time handlers (continuous user audio → model, model audio → user, screen data → model, and text responses → user), voice activity detection with intelligent filtering, continuous screen capture with adaptive throttling, an audio + text AI response delivery system, background task management with cleanup, enhanced error handling, and a 300 s real-time timeout. The legible deleted lines show the same RealTimeScreenAssistant handler extended with a conversation-history deque (maxlen=20), a set of tracked background tasks, a 0.01 voice-activity threshold that drops up to 10 consecutive silent frames, adaptive JPEG quality (85 while the user is speaking, 75 when quiet), the "gemini-2.0-flash-live-preview" model, async connect/disconnect wrappers, and a larger Gradio UI with a performance-stats box, streaming screen-capture and AI audio components, an AI response textbox, and an architecture accordion.]
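One piece of the deleted code that is still legible is the voice-activity filter it applied to microphone frames before forwarding them. A minimal sketch of that filter, using the threshold and frame counts exactly as they appear in the deleted lines (the `should_forward` helper and its signature are mine, for illustration only):

```python
import numpy as np

# Values taken from the removed version's __init__
VOICE_ACTIVITY_THRESHOLD = 0.01  # mean absolute amplitude below this counts as silence
MAX_SILENT_FRAMES = 10           # leading silent frames to drop

def should_forward(audio_np: np.ndarray, silent_frames: int) -> tuple[bool, int]:
    """Decide whether to forward one mic frame; returns (forward, updated silent-frame counter)."""
    audio_level = np.abs(audio_np.astype(np.float32)).mean()
    if audio_level < VOICE_ACTIVITY_THRESHOLD:
        silent_frames += 1
        if silent_frames < MAX_SILENT_FRAMES:
            return False, silent_frames  # skip leading silence
        return True, silent_frames       # after enough silence, frames pass through again
    return True, 0                       # speech: forward and reset the counter
```

The updated app.py added by this commit follows in full.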
"""
Real-Time Screen Assistant - Refactored for Google GenAI Live API + FastRTC

This application transforms the original screenshot analyzer into a real-time
screen sharing assistant with voice interaction, following the refactoring
instructions for live streaming capabilities.
"""

import os
import asyncio
import time
import numpy as np
import numpy.typing as npt
import cv2
import gradio as gr
from fastrtc import Stream, AsyncAudioVideoStreamHandler, get_cloudflare_turn_credentials_async, ReplyOnPause
from google import genai
from google.genai import types

API_KEY = os.getenv("GEMINI_API_KEY", "")

class RealTimeScreenAssistant(AsyncAudioVideoStreamHandler):
    """
    Real-time screen assistant implementing the refactoring instructions.

    Features:
    - Google GenAI Live API integration
    - Real-time audio/video streaming via FastRTC
    - Voice activity detection with ReplyOnPause
    - Intelligent frame sampling for screen sharing
    - Cloudflare TURN server support for HF Spaces
    """

    def __init__(self):
        super().__init__(
            expected_layout="mono",
            output_sample_rate=24000,
            input_sample_rate=16000
        )
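        # 16 kHz mono in (matching the "audio/pcm;rate=16000" blobs sent below),
        # 24 kHz out (matching the sample rate returned by emit()).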
        self.session = None
        self.last_frame_time = 0
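        # Model audio from _handle_responses() is buffered here and drained by emit()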
        self.audio_queue = asyncio.Queue()
        self.connected = False
        self.frame_interval = 1.0  # 1 FPS as per instructions

    async def start_up(self):
        """Initialize Google GenAI Live session as per Task 8-10"""
        try:
            # Re-check environment variable in case it was set after import
            current_api_key = os.getenv("GEMINI_API_KEY", "")
            if not current_api_key:
                print("❌ No GEMINI_API_KEY found in environment")
                return

            # Initialize client with v1alpha API (Task 8)
            client = genai.Client(
                api_key=current_api_key,
                http_options={"api_version": "v1alpha"}
            )

            # Configure live session (Task 9) - minimal working config
            from google.genai.types import LiveConnectConfig

            # Start with minimal config to avoid WebSocket errors
            config = LiveConnectConfig(
                system_instruction=(
                    "You are a helpful real-time assistant who watches the user's screen and provides "
                    "guidance on using software. Give clear, step-by-step instructions based on what "
                    "you see and hear. Be proactive in offering assistance."
                )
            )

            # Connect to Live API (Task 10) - using async context manager
            self.session_context = client.aio.live.connect(
                model="gemini-2.0-flash-live-001",
                config=config
            )
            self.session = await self.session_context.__aenter__()

            self.connected = True
            print("✅ Connected to Google GenAI Live API")

            # Start response handler (Task 13)
            self.response_task = asyncio.create_task(self._handle_responses())

        except Exception as e:
            print(f"❌ Failed to connect to GenAI: {e}")
            self.connected = False

    async def _handle_responses(self):
        """Handle AI responses as per Task 12-13"""
        try:
            # The Google GenAI Live API provides an async iterator through session.receive()
            # We need to handle this as a streaming response
            while self.connected and self.session:
                try:
                    # Get the next message from the session
                    response_stream = self.session.receive()

                    # Check if this is an async iterator or needs to be awaited
                    if hasattr(response_stream, '__aiter__'):
                        # It's an async iterator
                        async for msg in response_stream:
                            if not self.connected:
                                break

                            if msg.data:  # Audio response
                                # Convert to numpy for FastRTC (Task 13)
                                audio_array = np.frombuffer(msg.data, dtype=np.int16)
                                if len(audio_array) > 0:
                                    audio_array = audio_array.reshape(1, -1)
                                    await self.audio_queue.put(audio_array)

                            if msg.text:  # Text response
                                print(f"🤖 AI: {msg.text}")
                    else:
                        # It's a single response that needs to be awaited
                        msg = await response_stream
                        if msg:
                            if msg.data:  # Audio response
                                audio_array = np.frombuffer(msg.data, dtype=np.int16)
                                if len(audio_array) > 0:
                                    audio_array = audio_array.reshape(1, -1)
                                    await self.audio_queue.put(audio_array)

                            if msg.text:  # Text response
                                print(f"🤖 AI: {msg.text}")

                except Exception as inner_e:
                    if "connection" in str(inner_e).lower() or "closed" in str(inner_e).lower():
                        print("🔴 Connection closed, stopping response handler")
                        break
                    else:
                        print(f"⚠️ Response handling error: {inner_e}")
                        await asyncio.sleep(0.1)  # Brief pause before retry

        except Exception as e:
            print(f"❌ Error handling AI responses: {e}")

    async def receive(self, frame: tuple[int, npt.NDArray[np.int16]]):
        """Handle microphone audio (Task 11)"""
        if not self.connected or not self.session:
            return

        try:
            _, audio_np = frame
            audio_bytes = audio_np.tobytes()

            # Send audio to GenAI Live API using new non-deprecated method
            await self.session.send_realtime_input(
                input=types.Blob(
                    data=audio_bytes,
                    mime_type="audio/pcm;rate=16000"
                )
            )
        except Exception as e:
            print(f"❌ Error sending audio: {e}")

    async def video_receive(self, frame: npt.NDArray[np.float32]):
        """Handle screen video frames (Task 11-12)"""
        if not self.connected or not self.session:
            return

        try:
            # Throttle to 1 FPS as per instructions
            current_time = time.time()
            if current_time - self.last_frame_time < self.frame_interval:
                return

            self.last_frame_time = current_time

            # Convert float32 frame to uint8 for JPEG encoding
            if frame.dtype == np.float32:
                # Assuming frame is in range [0, 1], convert to [0, 255]
                frame_uint8 = (frame * 255).astype(np.uint8)
            else:
                frame_uint8 = frame.astype(np.uint8)

            # Check for empty frame before encoding
            if frame_uint8.size == 0 or frame_uint8.shape[0] == 0 or frame_uint8.shape[1] == 0:
                return

            # Encode as JPEG (Task 12)
            try:
                success, jpg_bytes = cv2.imencode('.jpg', frame_uint8, [cv2.IMWRITE_JPEG_QUALITY, 80])
                if not success:
                    return
            except cv2.error:
                # Handle OpenCV encoding errors gracefully
                return

            # Send to GenAI using new non-deprecated method
            await self.session.send_realtime_input(
                input=types.Blob(
                    data=jpg_bytes.tobytes(),
                    mime_type="image/jpeg"
                )
            )

        except Exception as e:
            print(f"❌ Error sending video frame: {e}")

    async def emit(self):
        """Emit audio back to user (Task 13)"""
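        # Called by FastRTC to pull the next output chunk: a (sample_rate, ndarray)
        # tuple when model audio is queued, or None when there is nothing to play.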
        try:
            audio_chunk = self.audio_queue.get_nowait()
            return (24000, audio_chunk)
        except asyncio.QueueEmpty:
            return None

    def copy(self):
        """Copy method required by FastRTC AsyncAudioVideoStreamHandler"""
        # Return a new instance with same configuration
        new_instance = RealTimeScreenAssistant()
        new_instance.frame_interval = self.frame_interval
        return new_instance

    async def video_emit(self):
        """Video emit method required by FastRTC AsyncAudioVideoStreamHandler"""
        # For this use case, we don't emit video back to user
        return None

    async def shutdown(self):
        """Clean shutdown (Task 17)"""
        self.connected = False

        # Cancel response handler task if it exists
        if hasattr(self, 'response_task') and not self.response_task.done():
            self.response_task.cancel()
            try:
                await self.response_task
            except asyncio.CancelledError:
                pass

        if self.session:
            try:
                # Properly close the session using context manager
                if hasattr(self, 'session_context'):
                    await self.session_context.__aexit__(None, None, None)
                else:
                    await self.session.close()
                print("🔴 Disconnected from GenAI Live API")
            except Exception as e:
                print(f"❌ Error during shutdown: {e}")

        self.session = None
        if hasattr(self, 'session_context'):
            self.session_context = None

# Global state
app_state = {"stream": None, "handler": None, "connected": False}

def initialize_real_time_assistant():
    """Initialize the real-time assistant (Task 26-29)"""
    try:
        # Create handler
        handler = RealTimeScreenAssistant()
        app_state["handler"] = handler

        # Create stream with Cloudflare TURN (Task 22-23)
        stream = Stream(
            handler=ReplyOnPause(handler),  # Voice activity detection (Task 3)
            modality="audio-video",
            mode="send-receive",
            rtc_configuration=get_cloudflare_turn_credentials_async,
            time_limit=300  # 5 minute limit for Spaces
        )

        app_state["stream"] = stream
        return stream

    except Exception as e:
        print(f"❌ Error creating stream: {e}")
        return None

def handle_connect():
    """Connect button handler (Task 16)"""
    # Re-check environment variable in case it was set after import
    current_api_key = os.getenv("GEMINI_API_KEY", "")
    if not current_api_key:
        return "❌ Please set GEMINI_API_KEY environment variable"

    if app_state["connected"]:
        return "✅ Already connected - session is active"

    app_state["connected"] = True
    return "✅ Connecting... Please allow microphone and camera permissions"

def handle_disconnect():
    """Disconnect button handler (Task 17)"""
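    # asyncio.run() spins up a fresh event loop for the async shutdown, since
    # this button callback itself is a plain synchronous function.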
    if app_state["handler"] and app_state["connected"]:
        asyncio.run(app_state["handler"].shutdown())
        app_state["connected"] = False
        return "🔴 Disconnected from AI assistant"

    return "Already disconnected"

# Screen sharing JavaScript - Fixed syntax for HF Spaces
screen_share_js = '''
(async function() {
    try {
        if (!navigator.mediaDevices || !navigator.mediaDevices.getDisplayMedia) {
            return "❌ Screen sharing not supported in this browser";
        }

        const stream = await navigator.mediaDevices.getDisplayMedia({
            video: {
                width: { ideal: 1920 },
                height: { ideal: 1080 }
            },
            audio: false
        });

        // Find the video element from FastRTC
        const videos = document.querySelectorAll('video');
        let targetVideo = null;

        for (let video of videos) {
            if (video.srcObject && video.srcObject.getVideoTracks().length > 0) {
                targetVideo = video;
                break;
            }
        }

        if (targetVideo && targetVideo.srcObject) {
            // Replace the camera track with screen track
            const screenTrack = stream.getVideoTracks()[0];
            const sender = targetVideo.srcObject.getVideoTracks()[0];

            // Remove old track and add screen track
            targetVideo.srcObject.removeTrack(sender);
            targetVideo.srcObject.addTrack(screenTrack);

            screenTrack.onended = () => {
                console.log("Screen sharing ended");
            };

            return "🖥️ Screen sharing started successfully!";
        } else {
            return "❌ Could not find video stream to replace";
        }

    } catch (error) {
        console.error("Screen sharing error:", error);
        if (error.name === "NotAllowedError") {
            return "❌ Screen sharing permission denied - please allow screen access";
        } else if (error.name === "NotSupportedError") {
            return "❌ Screen sharing not supported in this environment";
        } else {
            return "❌ Screen sharing failed: " + error.message;
        }
    }
})()'''

def create_interface():
    """Create main interface (Task 26-30)"""

    # Initialize stream
    stream = initialize_real_time_assistant()

    with gr.Blocks(
        title="Real-Time Screen Assistant",
        theme=gr.themes.Soft()
    ) as demo:

        gr.Markdown("# 🖥️ Real-Time Screen Assistant")
        gr.Markdown("""
**🎯 LIVE AI that sees your screen and provides real-time guidance!**

**How it works:**
1. **Connect** - Links to Google's GenAI Live API for real-time AI processing
2. **Share Screen** - AI can see exactly what you're doing on your screen
3. **Voice Chat** - Talk naturally, AI responds with voice and sees everything
4. **Get Help** - Real-time assistance with software, coding, troubleshooting

**Tech Stack:**
- 🧠 Google GenAI Live API (multimodal real-time AI)
- 📹 FastRTC (low-latency screen/audio streaming)
- 🎙️ Voice activity detection
- Cloudflare TURN servers (HF Spaces optimized)
""")

        # Status display
        status_display = gr.Textbox(
            label="Status",
            value="Ready to connect - Click Connect to start real-time session",
            interactive=False
        )

        # Control buttons (Task 3, 16-17)
        with gr.Row():
            connect_btn = gr.Button("Connect", variant="primary")
            mic_btn = gr.Button("🎙️ Test Microphone", variant="secondary")
            screen_btn = gr.Button("🖥️ Show Your Screen", variant="secondary")
            disconnect_btn = gr.Button("🔴 Disconnect", variant="stop")

        # Stream interface - FastRTC UI for microphone and video
        gr.Markdown("### 📡 Live Audio/Video Stream")
        if stream:
            # Mount the FastRTC stream UI - this provides microphone access
            gr.HTML("""
<div id="fastrtc-container">
    <p>🎙️ Microphone and video streaming handled by FastRTC</p>
    <p>Click 'Test Microphone' and 'Show Your Screen' to activate</p>
</div>
""")
        else:
            gr.HTML("<div>⚠️ Stream initialization failed - Check console for errors</div>")

        # Microphone activation JavaScript
        microphone_js = '''
(async function() {
    try {
        // Request microphone permission and start audio
        const stream = await navigator.mediaDevices.getUserMedia({
            audio: {
                sampleRate: 16000,
                channelCount: 1,
                echoCancellation: true,
                noiseSuppression: true
            }
        });
        console.log("Microphone access granted");
        return "🎙️ Microphone connected successfully";
    } catch (error) {
        console.error("Microphone error:", error);
        if (error.name === "NotAllowedError") {
            return "❌ Microphone permission denied - please allow microphone access";
        } else {
            return "❌ Microphone failed: " + error.message;
        }
    }
})()
'''

        # Instructions (Task 1-3)
        with gr.Accordion("Instructions", open=True):
            gr.Markdown("""
**How to use the real-time assistant:**

1. **Connect**: Click Connect to start the AI session
2. **Permissions**: Allow microphone and camera access
3. **Show Screen**: Click "Show Your Screen" to share your screen
4. **Voice Interaction**: Simply speak - the AI will respond
5. **Real-time Guidance**: AI sees your screen and provides live help
6. **Disconnect**: Click Disconnect when finished

**Features implemented from refactoring instructions:**
- ✅ FastRTC WebRTC streaming (Task 2)
- ✅ Google GenAI Live API integration (Task 7-15)
- ✅ Connect/Show Screen/Disconnect controls (Task 3, 16-17)
- ✅ Voice activity detection with ReplyOnPause (Task 3)
- ✅ Screen sharing via getDisplayMedia (Task 6)
- ✅ Real-time advice generation (Task 18-21)
- ✅ Cloudflare TURN for HF Spaces (Task 22-23)
""")

        # Privacy notice (Task 24-25)
        with gr.Accordion("Privacy & Security", open=False):
            gr.Markdown("""
**Privacy Notice:**
- Screen content and voice are processed by Google's AI services
- Data is transmitted securely via encrypted WebRTC connections
- No permanent storage - all processing is real-time
- You control what is shared and can disconnect anytime

**Technical Details:**
- Uses Google Gemini Live API for real-time multimodal processing
- FastRTC provides low-latency WebRTC streaming
- Cloudflare TURN servers ensure reliable connectivity on HF Spaces
- Voice activity detection prevents interruptions
""")

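        # The mic and screen buttons pass their JavaScript snippets via js=, so the
        # permission prompts run in the browser; the Python lambdas only set the status text.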
        # Wire up controls
        connect_btn.click(
            fn=handle_connect,
            outputs=[status_display]
        )

        mic_btn.click(
            fn=lambda: "🎙️ Testing microphone...",
            outputs=[status_display],
            js=microphone_js
        )

        screen_btn.click(
            fn=lambda: "🖥️ Requesting screen share...",
            outputs=[status_display],
            js=screen_share_js
        )

        disconnect_btn.click(
            fn=handle_disconnect,
            outputs=[status_display]
        )

    return demo

# Main execution
if __name__ == "__main__":
    print("🖥️ Real-Time Screen Assistant")
    print("=" * 50)
    print("Refactored according to instructions for:")
    print("- Google GenAI Live API integration")
    print("- FastRTC real-time streaming")
    print("- Voice activity detection")
    print("- Screen sharing capabilities")
    print("- Cloudflare TURN for HF Spaces")

    if not API_KEY:
        print("\n⚠️ No GEMINI_API_KEY environment variable found")
        print("Please set your Google AI API key:")
        print("export GEMINI_API_KEY='your-api-key-here'")
    else:
        print(f"\n✅ API key configured (length: {len(API_KEY)})")

    print("\nStarting real-time assistant...")

    try:
        demo = create_interface()
        demo.launch(
            # ... launch arguments unchanged from the previous revision (not shown in this diff)
        )
    except Exception as e:
        print(f"❌ Failed to launch: {e}")
        print("Ensure all dependencies are installed: pip install -r requirements.txt")
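For a quick check of the handler outside the Gradio UI, something like the following can be run next to app.py. This is a sketch, not part of the commit: it assumes GEMINI_API_KEY is exported and the dependencies imported above are installed, and the `smoke_test` helper is invented here for illustration.

```python
import asyncio

import numpy as np

from app import RealTimeScreenAssistant  # the app.py added by this commit

async def smoke_test():
    # Connect, push one silent mic frame, see whether any model audio
    # has been queued, then shut the session down cleanly.
    handler = RealTimeScreenAssistant()
    await handler.start_up()
    if not handler.connected:
        return
    silence = np.zeros((1, 16000), dtype=np.int16)  # ~1 s of silent 16 kHz mono audio
    await handler.receive((16000, silence))
    await asyncio.sleep(2.0)  # give _handle_responses() a moment to queue any reply
    print("queued audio chunk:", await handler.emit())
    await handler.shutdown()

if __name__ == "__main__":
    asyncio.run(smoke_test())
```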