arthrod commited on
Commit
bdb76ca
Β·
verified Β·
1 Parent(s): d5c1a40

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +360 -504
app.py CHANGED
@@ -1,32 +1,19 @@
1
- """Real-Time Screen Assistant - Premium Edition with Complete Frontend Integration
2
-
3
- This is the PREMIUM, BEST WORKING version with comprehensive real-time handlers:
4
- 1. Continuous audio flow from user β†’ model
5
- 2. Model audio output β†’ user
6
- 3. Screen data streaming β†’ model
7
- 4. Text responses from system β†’ user
8
 
9
- Features:
10
- - Google GenAI Live API integration with enhanced configuration
11
- - Real-time audio/video streaming via FastRTC
12
- - Voice activity detection with intelligent filtering
13
- - Continuous screen capture with adaptive throttling
14
- - AI response delivery system (audio + text)
15
- - Background task management with proper cleanup
16
- - Enhanced error handling and recovery
17
- - 300s timeout for real-time behavior
18
  """
19
 
20
- import asyncio
21
  import os
 
22
  import time
23
- from collections import deque
24
-
25
- import cv2
26
- import gradio as gr
27
  import numpy as np
28
  import numpy.typing as npt
29
- from fastrtc import AsyncAudioVideoStreamHandler, ReplyOnPause, Stream, get_cloudflare_turn_credentials_async
 
 
30
  from google import genai
31
  from google.genai import types
32
 
@@ -34,633 +21,502 @@ from google.genai import types
34
  API_KEY = os.getenv("GEMINI_API_KEY", "")
35
 
36
  class RealTimeScreenAssistant(AsyncAudioVideoStreamHandler):
37
- """Premium Real-time screen assistant with complete frontend integration.
38
-
39
- Real-time Frontend Integration Features:
40
- - Continuous audio streaming with voice activity detection
41
- - Real-time screen capture with intelligent throttling
42
- - AI audio response processing and delivery
43
- - Text response handling and display
44
- - Background task management
45
- - Enhanced error recovery
46
  """
47
-
 
 
 
 
 
 
 
 
 
48
  def __init__(self):
49
  super().__init__(
50
- expected_layout="mono",
51
- output_sample_rate=24000,
52
  input_sample_rate=16000
53
  )
54
  self.session = None
55
  self.last_frame_time = 0
56
  self.audio_queue = asyncio.Queue()
57
- self.text_queue = asyncio.Queue()
58
  self.connected = False
59
- self.frame_interval = 1.0 # Adaptive frame interval
60
-
61
- # Enhanced features for premium version
62
- self.conversation_history = deque(maxlen=20) # Keep last 20 exchanges
63
- self.background_tasks = set() # Track background tasks
64
- self.voice_activity_threshold = 0.01 # Voice activity detection threshold
65
- self.consecutive_silent_frames = 0
66
- self.max_silent_frames = 10 # Filter out silence
67
-
68
- # Performance optimization
69
- self.last_audio_level = 0.0
70
- self.frame_skip_counter = 0
71
- self.adaptive_quality = True
72
-
73
  async def start_up(self):
74
- """Enhanced startup with premium configuration"""
75
  try:
 
76
  current_api_key = os.getenv("GEMINI_API_KEY", "")
77
  if not current_api_key:
78
  print("❌ No GEMINI_API_KEY found in environment")
79
  return
80
-
81
- # Initialize client with premium configuration
82
  client = genai.Client(
83
  api_key=current_api_key,
84
  http_options={"api_version": "v1alpha"}
85
  )
86
-
87
- # PREMIUM: Enhanced configuration with all features enabled
88
- config = {
89
- "response_modalities": ["AUDIO", "TEXT"],
90
- "input_audio_transcription": {"model": "latest"},
91
- "output_audio_transcription": {"model": "latest"},
92
- "system_instruction": {
93
- "parts": [{
94
- "text": (
95
- "You are an expert real-time screen assistant with premium capabilities. "
96
- "You can see the user's screen continuously and hear their voice in real-time. "
97
- "Provide intelligent, proactive assistance based on what you observe. "
98
- "Give clear, actionable guidance for software usage, coding, troubleshooting, "
99
- "and any tasks you see the user working on. Be concise but comprehensive. "
100
- "Respond with both voice and text when helpful."
101
- )
102
- }]
103
- },
104
- "generation_config": {
105
- "response_mime_type": "text/plain",
106
- "temperature": 0.7,
107
- "max_output_tokens": 512
108
- }
109
- }
110
-
111
- # Connect with enhanced configuration
112
- self.session = await client.aio.live.connect(
113
- model="gemini-2.0-flash-live-preview",
114
  config=config
115
  )
116
-
 
117
  self.connected = True
118
- print("βœ… Connected to Google GenAI Live API (Premium Mode)")
119
-
120
- # Start enhanced response handler
121
- response_task = asyncio.create_task(self._handle_responses())
122
- self.background_tasks.add(response_task)
123
- response_task.add_done_callback(self.background_tasks.discard)
124
-
125
  except Exception as e:
126
  print(f"❌ Failed to connect to GenAI: {e}")
127
  self.connected = False
128
-
129
  async def _handle_responses(self):
130
- """Enhanced response handler with conversation history"""
131
  try:
132
- async for response in self.session.receive():
133
- if not self.connected:
134
- break
135
-
136
  try:
137
- # Handle audio responses (premium feature)
138
- if hasattr(response, 'data') and response.data:
139
- audio_array = np.frombuffer(response.data, dtype=np.int16)
140
- if len(audio_array) > 0:
141
- audio_array = audio_array.reshape(1, -1)
142
- await self.audio_queue.put(audio_array)
143
-
144
- # Handle text responses with conversation history
145
- if hasattr(response, 'text') and response.text:
146
- print(f"πŸ€– AI: {response.text}")
147
-
148
- # Add to conversation history
149
- self.conversation_history.append({
150
- "timestamp": time.time(),
151
- "type": "ai_response",
152
- "content": response.text
153
- })
154
-
155
- # Queue for frontend delivery
156
- await self.text_queue.put(response.text)
157
-
158
- # Handle structured responses (premium)
159
- if hasattr(response, 'server_content') and response.server_content:
160
- if hasattr(response.server_content, 'model_turn'):
161
- model_turn = response.server_content.model_turn
162
- if hasattr(model_turn, 'parts'):
163
- for part in model_turn.parts:
164
- if hasattr(part, 'text') and part.text:
165
- print(f"πŸ€– AI: {part.text}")
166
- await self.text_queue.put(part.text)
167
-
168
- except Exception as e:
169
- print(f"⚠️ Response processing error: {e}")
170
-
 
 
 
 
 
 
171
  except Exception as e:
172
- print(f"❌ Response handler error: {e}")
173
-
174
  async def receive(self, frame: tuple[int, npt.NDArray[np.int16]]):
175
- """PREMIUM: Enhanced audio processing with voice activity detection"""
176
  if not self.connected or not self.session:
177
  return
178
-
179
  try:
180
  _, audio_np = frame
181
-
182
- # PREMIUM: Voice activity detection
183
- audio_level = np.abs(audio_np.astype(np.float32)).mean()
184
- self.last_audio_level = audio_level
185
-
186
- # Filter out silence and background noise
187
- if audio_level < self.voice_activity_threshold:
188
- self.consecutive_silent_frames += 1
189
- if self.consecutive_silent_frames < self.max_silent_frames:
190
- return # Skip silent frames
191
- else:
192
- self.consecutive_silent_frames = 0
193
-
194
- # Convert and send audio
195
  audio_bytes = audio_np.tobytes()
196
-
197
- # PREMIUM: Send with metadata
198
  await self.session.send_realtime_input(
199
  input=types.Blob(
200
- data=audio_bytes,
201
  mime_type="audio/pcm;rate=16000"
202
  )
203
  )
204
-
205
- # Track user interaction
206
- self.conversation_history.append({
207
- "timestamp": time.time(),
208
- "type": "user_audio",
209
- "audio_level": float(audio_level)
210
- })
211
-
212
  except Exception as e:
213
  print(f"❌ Error sending audio: {e}")
214
-
215
  async def video_receive(self, frame: npt.NDArray[np.float32]):
216
- """PREMIUM: Enhanced screen capture with adaptive throttling"""
217
  if not self.connected or not self.session:
218
  return
219
-
220
  try:
221
- # PREMIUM: Adaptive frame throttling based on activity
222
  current_time = time.time()
223
-
224
- # Adaptive interval based on user activity
225
- if hasattr(self, 'last_audio_level') and self.last_audio_level > 0.05:
226
- # More frequent updates during active conversation
227
- adaptive_interval = self.frame_interval * 0.5
228
- else:
229
- # Standard interval during quiet periods
230
- adaptive_interval = self.frame_interval
231
-
232
- if current_time - self.last_frame_time < adaptive_interval:
233
  return
234
-
235
  self.last_frame_time = current_time
236
-
237
- # PREMIUM: Enhanced frame processing
238
  if frame.dtype == np.float32:
 
239
  frame_uint8 = (frame * 255).astype(np.uint8)
240
  else:
241
  frame_uint8 = frame.astype(np.uint8)
242
-
243
- # Validate frame
244
  if frame_uint8.size == 0 or frame_uint8.shape[0] == 0 or frame_uint8.shape[1] == 0:
245
  return
246
-
247
- # PREMIUM: Adaptive quality encoding
248
- quality = 85 if self.adaptive_quality and self.last_audio_level > 0.02 else 75
249
-
250
  try:
251
- success, jpg_bytes = cv2.imencode('.jpg', frame_uint8, [cv2.IMWRITE_JPEG_QUALITY, quality])
252
  if not success:
253
  return
254
  except cv2.error:
 
255
  return
256
-
257
- # Send enhanced frame data
258
  await self.session.send_realtime_input(
259
  input=types.Blob(
260
- data=jpg_bytes.tobytes(),
261
  mime_type="image/jpeg"
262
  )
263
  )
264
-
265
- # Track screen activity
266
- self.conversation_history.append({
267
- "timestamp": time.time(),
268
- "type": "screen_frame",
269
- "quality": quality,
270
- "size": len(jpg_bytes)
271
- })
272
-
273
  except Exception as e:
274
  print(f"❌ Error sending video frame: {e}")
275
-
276
  async def emit(self):
277
- """PREMIUM: Enhanced audio emission with queue management"""
278
  try:
279
  audio_chunk = self.audio_queue.get_nowait()
280
  return (24000, audio_chunk)
281
  except asyncio.QueueEmpty:
282
  return None
283
-
284
- async def get_latest_text(self):
285
- """PREMIUM: Get latest text response from AI"""
286
- try:
287
- text = self.text_queue.get_nowait()
288
- return text
289
- except asyncio.QueueEmpty:
290
- return None
291
-
292
  def copy(self):
293
- """Enhanced copy method with state preservation"""
 
294
  new_instance = RealTimeScreenAssistant()
295
  new_instance.frame_interval = self.frame_interval
296
- new_instance.voice_activity_threshold = self.voice_activity_threshold
297
- new_instance.adaptive_quality = self.adaptive_quality
298
  return new_instance
299
-
300
  async def video_emit(self):
301
- """Video emit method for FastRTC compatibility"""
 
302
  return None
303
-
304
  async def shutdown(self):
305
- """PREMIUM: Enhanced shutdown with complete cleanup"""
306
  self.connected = False
307
-
308
- # Cancel all background tasks
309
- for task in self.background_tasks.copy():
310
- if not task.done():
311
- task.cancel()
312
-
313
- # Wait for task cleanup
314
- if self.background_tasks:
315
- await asyncio.gather(*self.background_tasks, return_exceptions=True)
316
- self.background_tasks.clear()
317
-
318
- # Clean up queues
319
- while not self.audio_queue.empty():
320
  try:
321
- self.audio_queue.get_nowait()
322
- except asyncio.QueueEmpty:
323
- break
324
-
325
- while not self.text_queue.empty():
326
- try:
327
- self.text_queue.get_nowait()
328
- except asyncio.QueueEmpty:
329
- break
330
-
331
- # Clear conversation history
332
- self.conversation_history.clear()
333
-
334
- # Close session
335
  if self.session:
336
  try:
337
- await self.session.close()
 
 
 
 
338
  print("πŸ”΄ Disconnected from GenAI Live API")
339
  except Exception as e:
340
  print(f"❌ Error during shutdown: {e}")
341
-
342
  self.session = None
 
 
343
 
344
- # Global state for premium app
345
- app_state = {
346
- "stream": None,
347
- "handler": None,
348
- "connected": False,
349
- "last_status": "Ready to connect",
350
- "stats": {"audio_sent": 0, "frames_sent": 0, "responses_received": 0}
351
- }
352
 
353
  def initialize_real_time_assistant():
354
- """PREMIUM: Enhanced stream initialization"""
355
  try:
 
356
  handler = RealTimeScreenAssistant()
357
  app_state["handler"] = handler
358
-
359
- # PREMIUM: Enhanced stream configuration
360
  stream = Stream(
361
- handler=ReplyOnPause(handler), # Voice activity detection
362
- modality="audio-video",
363
  mode="send-receive",
364
  rtc_configuration=get_cloudflare_turn_credentials_async,
365
- time_limit=300, # 5 minutes - real-time optimized
366
- ui_args={
367
- "title": "Premium Real-Time Assistant",
368
- "subtitle": "Audio-Video Streaming with Gemini 2.0",
369
- "hide_title": False
370
- }
371
  )
372
-
373
  app_state["stream"] = stream
374
  return stream
375
-
376
  except Exception as e:
377
  print(f"❌ Error creating stream: {e}")
378
  return None
379
 
380
- async def handle_connect_async():
381
- """PREMIUM: Enhanced async connection handler"""
 
382
  current_api_key = os.getenv("GEMINI_API_KEY", "")
383
  if not current_api_key:
384
  return "❌ Please set GEMINI_API_KEY environment variable"
385
-
386
  if app_state["connected"]:
387
  return "βœ… Already connected - session is active"
 
 
 
388
 
389
- try:
390
- if app_state["handler"]:
391
- await app_state["handler"].start_up()
392
- app_state["connected"] = True
393
- app_state["last_status"] = "Connected to GenAI Live API"
394
- return "βœ… Connected to GenAI Live API - Ready for real-time interaction!"
395
- else:
396
- return "❌ Handler not initialized"
397
- except Exception as e:
398
- app_state["connected"] = False
399
- return f"❌ Connection failed: {str(e)}"
400
-
401
- def handle_connect():
402
- """Sync wrapper for connection"""
403
- app_state["connected"] = True # Optimistic update for UI
404
- app_state["last_status"] = "Initiating connection..."
405
-
406
- # Start async connection
407
- asyncio.create_task(handle_connect_async())
408
- return "πŸ”„ Initiating connection to GenAI Live API..."
409
-
410
- async def handle_disconnect_async():
411
- """PREMIUM: Enhanced async disconnect handler"""
412
  if app_state["handler"] and app_state["connected"]:
413
- try:
414
- await app_state["handler"].shutdown()
415
- app_state["connected"] = False
416
- app_state["handler"] = None
417
- app_state["last_status"] = "Disconnected"
418
- return "πŸ”΄ Disconnected from AI assistant"
419
- except Exception as e:
420
- return f"❌ Error during disconnect: {str(e)}"
421
  return "Already disconnected"
422
 
423
- def handle_disconnect():
424
- """Sync wrapper for disconnect"""
425
- app_state["connected"] = False # Immediate update for UI
426
-
427
- # Start async disconnect
428
- asyncio.create_task(handle_disconnect_async())
429
- return "πŸ”„ Disconnecting from AI assistant..."
430
-
431
- def get_connection_status():
432
- """PREMIUM: Get detailed connection status"""
433
- if app_state["connected"]:
434
- stats = app_state["stats"]
435
- return f"🟒 Connected | Audio: {stats['audio_sent']} | Frames: {stats['frames_sent']} | Responses: {stats['responses_received']}"
436
- else:
437
- return f"πŸ”΄ Disconnected | Status: {app_state['last_status']}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
438
 
439
  def create_interface():
440
- """PREMIUM: Enhanced interface with complete real-time integration"""
441
- # Initialize premium stream
 
442
  stream = initialize_real_time_assistant()
443
-
444
  with gr.Blocks(
445
- title="Real-Time Screen Assistant - Premium Edition",
446
  theme=gr.themes.Soft()
447
  ) as demo:
448
-
449
- gr.Markdown("# πŸš€ Real-Time Screen Assistant - Premium Edition")
450
  gr.Markdown("""
451
- **🎯 PREMIUM AI with complete real-time frontend integration!**
452
-
453
- **Real-time Frontend Integration Features:**
454
- βœ… **Continuous audio flow** - Voice activity detection, noise filtering
455
- βœ… **Model audio output** - AI voice responses with queue management
456
- βœ… **Screen data streaming** - Adaptive capture with intelligent throttling
457
- βœ… **Text response delivery** - Real-time text display with conversation history
458
-
459
- **Enhanced Premium Features:**
460
- - 🧠 Enhanced GenAI configuration with full modalities
461
- - πŸŽ™οΈ Intelligent voice activity detection
462
- - πŸ“Ή Adaptive screen capture (300s real-time timeout)
463
- - πŸ”„ Background task management with cleanup
464
- - πŸ“Š Performance monitoring and optimization
465
- - πŸ›‘οΈ Enhanced error handling and recovery
466
  """)
467
-
468
- # PREMIUM: Enhanced status display
469
- with gr.Row():
470
- status_display = gr.Textbox(
471
- label="πŸ”΄ Connection Status",
472
- value="Ready to connect - Premium features enabled",
473
- interactive=False
474
- )
475
- stats_display = gr.Textbox(
476
- label="πŸ“Š Performance Stats",
477
- value="Audio: 0 | Frames: 0 | Responses: 0",
478
- interactive=False
479
- )
480
-
481
- # PREMIUM: Enhanced control panel
482
  with gr.Row():
483
- connect_btn = gr.Button("πŸ”— Connect (Premium)", variant="primary")
 
 
484
  disconnect_btn = gr.Button("πŸ”΄ Disconnect", variant="stop")
485
-
486
- with gr.Row():
487
- mic_test_btn = gr.Button("πŸŽ™οΈ Test Microphone", variant="secondary")
488
- screen_share_btn = gr.Button("πŸ–₯️ Share Screen", variant="secondary")
489
-
490
- # PREMIUM: Real-time streaming interface
491
- gr.Markdown("### πŸ“‘ Premium Real-Time Stream")
492
-
493
  if stream:
494
- # Create streaming interface with enhanced configuration
495
- audio_stream = gr.Audio(
496
- streaming=True,
497
- autoplay=False,
498
- show_download_button=False,
499
- label="πŸŽ™οΈ Microphone Input (Voice Activity Detection)",
500
- interactive=True
501
- )
502
-
503
- video_stream = gr.Image(
504
- streaming=True,
505
- label="πŸ–₯️ Screen Capture (Adaptive Quality)",
506
- interactive=True
507
- )
508
-
509
- # PREMIUM: Connect streaming handlers
510
- audio_stream.stream(
511
- fn=lambda audio: app_state["handler"].receive(audio) if app_state["handler"] and app_state["connected"] else None,
512
- inputs=[audio_stream],
513
- outputs=[],
514
- time_limit=300, # Real-time optimized
515
- concurrency_limit=5
516
- )
517
-
518
- video_stream.stream(
519
- fn=lambda frame: app_state["handler"].video_receive(frame) if app_state["handler"] and app_state["connected"] else None,
520
- inputs=[video_stream],
521
- outputs=[],
522
- time_limit=300, # Real-time optimized
523
- concurrency_limit=3
524
- )
525
-
526
- # PREMIUM: AI response display
527
- ai_response_display = gr.Textbox(
528
- label="πŸ€– AI Response Stream",
529
- value="AI responses will appear here...",
530
- interactive=False,
531
- max_lines=10
532
- )
533
-
534
- # PREMIUM: Audio output
535
- ai_audio_output = gr.Audio(
536
- label="πŸ”Š AI Voice Response",
537
- autoplay=True,
538
- streaming=True
539
- )
540
-
541
- # Connect AI response handlers
542
- ai_audio_output.stream(
543
- fn=lambda: app_state["handler"].emit() if app_state["handler"] and app_state["connected"] else None,
544
- inputs=[],
545
- outputs=[ai_audio_output],
546
- time_limit=300
547
- )
548
-
549
  else:
550
- gr.HTML("<div>⚠️ Premium stream initialization failed - Check console for errors</div>")
551
-
552
- # PREMIUM: Enhanced instructions
553
- with gr.Accordion("πŸ“‹ Premium Instructions", open=True):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
554
  gr.Markdown("""
555
- **How to use the Premium Real-Time Assistant:**
556
-
557
- 1. **Connect**: Click "Connect (Premium)" to start enhanced AI session
558
- 2. **Permissions**: Allow microphone and camera access when prompted
559
- 3. **Voice Interaction**: Speak naturally - voice activity detection filters noise
560
- 4. **Screen Sharing**: Click "Share Screen" for continuous screen analysis
561
- 5. **Real-time Responses**: Receive both voice and text responses immediately
562
- 6. **Monitor Performance**: Check stats display for real-time metrics
563
-
564
- **Premium Features Active:**
565
- - βœ… **Continuous Audio Flow**: Voice activity detection with noise filtering
566
- - βœ… **Model Audio Output**: AI voice responses with smart queue management
567
- - βœ… **Screen Data Streaming**: Adaptive capture with 1 FPS optimization
568
- - βœ… **Text Response Delivery**: Real-time text with conversation history
569
- - βœ… **Background Task Management**: Proper async task handling and cleanup
570
- - βœ… **Enhanced Error Recovery**: Robust connection management
 
571
  """)
572
-
573
- # PREMIUM: Technical details
574
- with gr.Accordion("πŸ”§ Premium Technical Features", open=False):
575
  gr.Markdown("""
576
- **Real-Time Frontend Integration Implementation:**
577
-
578
- **1. Continuous Audio Flow (User β†’ Model):**
579
- ```python
580
- # Voice activity detection with threshold filtering
581
- audio_level = np.abs(audio_np.astype(np.float32)).mean()
582
- if audio_level < voice_activity_threshold:
583
- return # Filter silence
584
-
585
- # Enhanced send with metadata
586
- await session.send_realtime_input(input=types.Blob(...))
587
- ```
588
-
589
- **2. Model Audio Output (Model β†’ User):**
590
- ```python
591
- # AI response processing with queue management
592
- audio_array = np.frombuffer(response.data, dtype=np.int16)
593
- await audio_queue.put(audio_array.reshape(1, -1))
594
- ```
595
-
596
- **3. Screen Data Streaming (Screen β†’ Model):**
597
- ```python
598
- # Adaptive throttling based on activity
599
- adaptive_interval = frame_interval * (0.5 if active else 1.0)
600
- # Quality optimization: 85% for active, 75% for quiet
601
- ```
602
-
603
- **4. Text Response Delivery (System β†’ User):**
604
- ```python
605
- # Conversation history with timestamps
606
- conversation_history.append({
607
- "timestamp": time.time(),
608
- "type": "ai_response",
609
- "content": response.text
610
- })
611
- ```
612
-
613
- **Premium Optimizations:**
614
- - Background task management with proper cleanup
615
- - Enhanced error handling and recovery
616
- - Performance monitoring and adaptive quality
617
- - 300s timeout optimized for real-time behavior
618
  """)
619
-
620
- # Wire up premium controls
621
  connect_btn.click(
622
  fn=handle_connect,
623
  outputs=[status_display]
624
  )
625
-
 
 
 
 
 
 
 
 
 
 
 
 
626
  disconnect_btn.click(
627
  fn=handle_disconnect,
628
  outputs=[status_display]
629
  )
630
-
631
- # Initial load of connection status
632
- demo.load(
633
- fn=get_connection_status,
634
- outputs=[stats_display]
635
- )
636
-
637
  return demo
638
 
639
  # Main execution
640
  if __name__ == "__main__":
641
- print("πŸš€ Real-Time Screen Assistant - PREMIUM EDITION")
642
- print("=" * 60)
643
- print("βœ… Complete real-time frontend integration:")
644
- print(" 1. Continuous audio flow (user β†’ model)")
645
- print(" 2. Model audio output (model β†’ user)")
646
- print(" 3. Screen data streaming (screen β†’ model)")
647
- print(" 4. Text response delivery (system β†’ user)")
648
- print("βœ… Enhanced features:")
649
- print(" - Voice activity detection with noise filtering")
650
- print(" - Adaptive screen capture with quality optimization")
651
- print(" - Background task management with cleanup")
652
- print(" - Enhanced error handling and recovery")
653
- print(" - 300s timeout optimized for real-time behavior")
654
-
655
  if not API_KEY:
656
  print("\n⚠️ No GEMINI_API_KEY environment variable found")
657
  print("Please set your Google AI API key:")
658
  print("export GEMINI_API_KEY='your-api-key-here'")
659
  else:
660
- print(f"\nβœ… API key configured (Premium Mode)")
661
-
662
- print("\nπŸš€ Starting Premium Real-Time Assistant...")
663
-
664
  try:
665
  demo = create_interface()
666
  demo.launch(
@@ -671,4 +527,4 @@ if __name__ == "__main__":
671
  )
672
  except Exception as e:
673
  print(f"❌ Failed to launch: {e}")
674
- print("Ensure all dependencies are installed: pip install -r requirements.txt")
 
1
+ """
2
+ Real-Time Screen Assistant - Refactored for Google GenAI Live API + FastRTC
 
 
 
 
 
3
 
4
+ This application transforms the original screenshot analyzer into a real-time
5
+ screen sharing assistant with voice interaction, following the refactoring
6
+ instructions for live streaming capabilities.
 
 
 
 
 
 
7
  """
8
 
 
9
  import os
10
+ import asyncio
11
  import time
 
 
 
 
12
  import numpy as np
13
  import numpy.typing as npt
14
+ import cv2
15
+ import gradio as gr
16
+ from fastrtc import Stream, AsyncAudioVideoStreamHandler, get_cloudflare_turn_credentials_async, ReplyOnPause
17
  from google import genai
18
  from google.genai import types
19
 
 
21
  API_KEY = os.getenv("GEMINI_API_KEY", "")
22
 
23
  class RealTimeScreenAssistant(AsyncAudioVideoStreamHandler):
 
 
 
 
 
 
 
 
 
24
  """
25
+ Real-time screen assistant implementing the refactoring instructions.
26
+
27
+ Features:
28
+ - Google GenAI Live API integration
29
+ - Real-time audio/video streaming via FastRTC
30
+ - Voice activity detection with ReplyOnPause
31
+ - Intelligent frame sampling for screen sharing
32
+ - Cloudflare TURN server support for HF Spaces
33
+ """
34
+
35
  def __init__(self):
36
  super().__init__(
37
+ expected_layout="mono",
38
+ output_sample_rate=24000,
39
  input_sample_rate=16000
40
  )
41
  self.session = None
42
  self.last_frame_time = 0
43
  self.audio_queue = asyncio.Queue()
 
44
  self.connected = False
45
+ self.frame_interval = 1.0 # 1 FPS as per instructions
46
+
 
 
 
 
 
 
 
 
 
 
 
 
47
  async def start_up(self):
48
+ """Initialize Google GenAI Live session as per Task 8-10"""
49
  try:
50
+ # Re-check environment variable in case it was set after import
51
  current_api_key = os.getenv("GEMINI_API_KEY", "")
52
  if not current_api_key:
53
  print("❌ No GEMINI_API_KEY found in environment")
54
  return
55
+
56
+ # Initialize client with v1alpha API (Task 8)
57
  client = genai.Client(
58
  api_key=current_api_key,
59
  http_options={"api_version": "v1alpha"}
60
  )
61
+
62
+ # Configure live session (Task 9) - minimal working config
63
+ from google.genai.types import LiveConnectConfig
64
+
65
+ # Start with minimal config to avoid WebSocket errors
66
+ config = LiveConnectConfig(
67
+ system_instruction=(
68
+ "You are a helpful real-time assistant who watches the user's screen and provides "
69
+ "guidance on using software. Give clear, step-by-step instructions based on what "
70
+ "you see and hear. Be proactive in offering assistance."
71
+ )
72
+ )
73
+
74
+ # Connect to Live API (Task 10) - using async context manager
75
+ self.session_context = client.aio.live.connect(
76
+ model="gemini-2.0-flash-live-001",
 
 
 
 
 
 
 
 
 
 
 
 
77
  config=config
78
  )
79
+ self.session = await self.session_context.__aenter__()
80
+
81
  self.connected = True
82
+ print("βœ… Connected to Google GenAI Live API")
83
+
84
+ # Start response handler (Task 13)
85
+ self.response_task = asyncio.create_task(self._handle_responses())
86
+
 
 
87
  except Exception as e:
88
  print(f"❌ Failed to connect to GenAI: {e}")
89
  self.connected = False
90
+
91
  async def _handle_responses(self):
92
+ """Handle AI responses as per Task 12-13"""
93
  try:
94
+ # The Google GenAI Live API provides an async iterator through session.receive()
95
+ # We need to handle this as a streaming response
96
+ while self.connected and self.session:
 
97
  try:
98
+ # Get the next message from the session
99
+ response_stream = self.session.receive()
100
+
101
+ # Check if this is an async iterator or needs to be awaited
102
+ if hasattr(response_stream, '__aiter__'):
103
+ # It's an async iterator
104
+ async for msg in response_stream:
105
+ if not self.connected:
106
+ break
107
+
108
+ if msg.data: # Audio response
109
+ # Convert to numpy for FastRTC (Task 13)
110
+ audio_array = np.frombuffer(msg.data, dtype=np.int16)
111
+ if len(audio_array) > 0:
112
+ audio_array = audio_array.reshape(1, -1)
113
+ await self.audio_queue.put(audio_array)
114
+
115
+ if msg.text: # Text response
116
+ print(f"πŸ€– AI: {msg.text}")
117
+ else:
118
+ # It's a single response that needs to be awaited
119
+ msg = await response_stream
120
+ if msg:
121
+ if msg.data: # Audio response
122
+ audio_array = np.frombuffer(msg.data, dtype=np.int16)
123
+ if len(audio_array) > 0:
124
+ audio_array = audio_array.reshape(1, -1)
125
+ await self.audio_queue.put(audio_array)
126
+
127
+ if msg.text: # Text response
128
+ print(f"πŸ€– AI: {msg.text}")
129
+
130
+ except Exception as inner_e:
131
+ if "connection" in str(inner_e).lower() or "closed" in str(inner_e).lower():
132
+ print("πŸ”΄ Connection closed, stopping response handler")
133
+ break
134
+ else:
135
+ print(f"⚠️ Response handling error: {inner_e}")
136
+ await asyncio.sleep(0.1) # Brief pause before retry
137
+
138
  except Exception as e:
139
+ print(f"❌ Error handling AI responses: {e}")
140
+
141
  async def receive(self, frame: tuple[int, npt.NDArray[np.int16]]):
142
+ """Handle microphone audio (Task 11)"""
143
  if not self.connected or not self.session:
144
  return
145
+
146
  try:
147
  _, audio_np = frame
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  audio_bytes = audio_np.tobytes()
149
+
150
+ # Send audio to GenAI Live API using new non-deprecated method
151
  await self.session.send_realtime_input(
152
  input=types.Blob(
153
+ data=audio_bytes,
154
  mime_type="audio/pcm;rate=16000"
155
  )
156
  )
 
 
 
 
 
 
 
 
157
  except Exception as e:
158
  print(f"❌ Error sending audio: {e}")
159
+
160
  async def video_receive(self, frame: npt.NDArray[np.float32]):
161
+ """Handle screen video frames (Task 11-12)"""
162
  if not self.connected or not self.session:
163
  return
164
+
165
  try:
166
+ # Throttle to 1 FPS as per instructions
167
  current_time = time.time()
168
+ if current_time - self.last_frame_time < self.frame_interval:
 
 
 
 
 
 
 
 
 
169
  return
170
+
171
  self.last_frame_time = current_time
172
+
173
+ # Convert float32 frame to uint8 for JPEG encoding
174
  if frame.dtype == np.float32:
175
+ # Assuming frame is in range [0, 1], convert to [0, 255]
176
  frame_uint8 = (frame * 255).astype(np.uint8)
177
  else:
178
  frame_uint8 = frame.astype(np.uint8)
179
+
180
+ # Check for empty frame before encoding
181
  if frame_uint8.size == 0 or frame_uint8.shape[0] == 0 or frame_uint8.shape[1] == 0:
182
  return
183
+
184
+ # Encode as JPEG (Task 12)
 
 
185
  try:
186
+ success, jpg_bytes = cv2.imencode('.jpg', frame_uint8, [cv2.IMWRITE_JPEG_QUALITY, 80])
187
  if not success:
188
  return
189
  except cv2.error:
190
+ # Handle OpenCV encoding errors gracefully
191
  return
192
+
193
+ # Send to GenAI using new non-deprecated method
194
  await self.session.send_realtime_input(
195
  input=types.Blob(
196
+ data=jpg_bytes.tobytes(),
197
  mime_type="image/jpeg"
198
  )
199
  )
200
+
 
 
 
 
 
 
 
 
201
  except Exception as e:
202
  print(f"❌ Error sending video frame: {e}")
203
+
204
  async def emit(self):
205
+ """Emit audio back to user (Task 13)"""
206
  try:
207
  audio_chunk = self.audio_queue.get_nowait()
208
  return (24000, audio_chunk)
209
  except asyncio.QueueEmpty:
210
  return None
211
+
 
 
 
 
 
 
 
 
212
  def copy(self):
213
+ """Copy method required by FastRTC AsyncAudioVideoStreamHandler"""
214
+ # Return a new instance with same configuration
215
  new_instance = RealTimeScreenAssistant()
216
  new_instance.frame_interval = self.frame_interval
 
 
217
  return new_instance
218
+
219
  async def video_emit(self):
220
+ """Video emit method required by FastRTC AsyncAudioVideoStreamHandler"""
221
+ # For this use case, we don't emit video back to user
222
  return None
223
+
224
  async def shutdown(self):
225
+ """Clean shutdown (Task 17)"""
226
  self.connected = False
227
+
228
+ # Cancel response handler task if it exists
229
+ if hasattr(self, 'response_task') and not self.response_task.done():
230
+ self.response_task.cancel()
 
 
 
 
 
 
 
 
 
231
  try:
232
+ await self.response_task
233
+ except asyncio.CancelledError:
234
+ pass
235
+
 
 
 
 
 
 
 
 
 
 
236
  if self.session:
237
  try:
238
+ # Properly close the session using context manager
239
+ if hasattr(self, 'session_context'):
240
+ await self.session_context.__aexit__(None, None, None)
241
+ else:
242
+ await self.session.close()
243
  print("πŸ”΄ Disconnected from GenAI Live API")
244
  except Exception as e:
245
  print(f"❌ Error during shutdown: {e}")
246
+
247
  self.session = None
248
+ if hasattr(self, 'session_context'):
249
+ self.session_context = None
250
 
251
+ # Global state
252
+ app_state = {"stream": None, "handler": None, "connected": False}
 
 
 
 
 
 
253
 
254
  def initialize_real_time_assistant():
255
+ """Initialize the real-time assistant (Task 26-29)"""
256
  try:
257
+ # Create handler
258
  handler = RealTimeScreenAssistant()
259
  app_state["handler"] = handler
260
+
261
+ # Create stream with Cloudflare TURN (Task 22-23)
262
  stream = Stream(
263
+ handler=ReplyOnPause(handler), # Voice activity detection (Task 3)
264
+ modality="audio-video",
265
  mode="send-receive",
266
  rtc_configuration=get_cloudflare_turn_credentials_async,
267
+ time_limit=300 # 5 minute limit for Spaces
 
 
 
 
 
268
  )
269
+
270
  app_state["stream"] = stream
271
  return stream
272
+
273
  except Exception as e:
274
  print(f"❌ Error creating stream: {e}")
275
  return None
276
 
277
+ def handle_connect():
278
+ """Connect button handler (Task 16)"""
279
+ # Re-check environment variable in case it was set after import
280
  current_api_key = os.getenv("GEMINI_API_KEY", "")
281
  if not current_api_key:
282
  return "❌ Please set GEMINI_API_KEY environment variable"
283
+
284
  if app_state["connected"]:
285
  return "βœ… Already connected - session is active"
286
+
287
+ app_state["connected"] = True
288
+ return "βœ… Connecting... Please allow microphone and camera permissions"
289
 
290
+ def handle_disconnect():
291
+ """Disconnect button handler (Task 17)"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
292
  if app_state["handler"] and app_state["connected"]:
293
+ asyncio.run(app_state["handler"].shutdown())
294
+ app_state["connected"] = False
295
+ return "πŸ”΄ Disconnected from AI assistant"
296
+
 
 
 
 
297
  return "Already disconnected"
298
 
299
+ # Screen sharing JavaScript - Fixed syntax for HF Spaces
300
+ screen_share_js = '''
301
+ (async function() {
302
+ try {
303
+ if (!navigator.mediaDevices || !navigator.mediaDevices.getDisplayMedia) {
304
+ return "❌ Screen sharing not supported in this browser";
305
+ }
306
+
307
+ const stream = await navigator.mediaDevices.getDisplayMedia({
308
+ video: {
309
+ width: { ideal: 1920 },
310
+ height: { ideal: 1080 }
311
+ },
312
+ audio: false
313
+ });
314
+
315
+ // Find the video element from FastRTC
316
+ const videos = document.querySelectorAll('video');
317
+ let targetVideo = null;
318
+
319
+ for (let video of videos) {
320
+ if (video.srcObject && video.srcObject.getVideoTracks().length > 0) {
321
+ targetVideo = video;
322
+ break;
323
+ }
324
+ }
325
+
326
+ if (targetVideo && targetVideo.srcObject) {
327
+ // Replace the camera track with screen track
328
+ const screenTrack = stream.getVideoTracks()[0];
329
+ const sender = targetVideo.srcObject.getVideoTracks()[0];
330
+
331
+ // Remove old track and add screen track
332
+ targetVideo.srcObject.removeTrack(sender);
333
+ targetVideo.srcObject.addTrack(screenTrack);
334
+
335
+ screenTrack.onended = () => {
336
+ console.log("Screen sharing ended");
337
+ };
338
+
339
+ return "πŸ–₯️ Screen sharing started successfully!";
340
+ } else {
341
+ return "❌ Could not find video stream to replace";
342
+ }
343
+
344
+ } catch (error) {
345
+ console.error("Screen sharing error:", error);
346
+ if (error.name === "NotAllowedError") {
347
+ return "❌ Screen sharing permission denied - please allow screen access";
348
+ } else if (error.name === "NotSupportedError") {
349
+ return "❌ Screen sharing not supported in this environment";
350
+ } else {
351
+ return "❌ Screen sharing failed: " + error.message;
352
+ }
353
+ }
354
+ })()'''
355
 
356
  def create_interface():
357
+ """Create main interface (Task 26-30)"""
358
+
359
+ # Initialize stream
360
  stream = initialize_real_time_assistant()
361
+
362
  with gr.Blocks(
363
+ title="Real-Time Screen Assistant",
364
  theme=gr.themes.Soft()
365
  ) as demo:
366
+
367
+ gr.Markdown("# πŸ–₯️ Real-Time Screen Assistant")
368
  gr.Markdown("""
369
+ **🎯 LIVE AI that sees your screen and provides real-time guidance!**
370
+
371
+ **How it works:**
372
+ 1. **Connect** - Links to Google's GenAI Live API for real-time AI processing
373
+ 2. **Share Screen** - AI can see exactly what you're doing on your screen
374
+ 3. **Voice Chat** - Talk naturally, AI responds with voice and sees everything
375
+ 4. **Get Help** - Real-time assistance with software, coding, troubleshooting
376
+
377
+ **Tech Stack:**
378
+ - 🧠 Google GenAI Live API (multimodal real-time AI)
379
+ - πŸ“Ή FastRTC (low-latency screen/audio streaming)
380
+ - πŸŽ™οΈ Voice activity detection
381
+ - 🌐 Cloudflare TURN servers (HF Spaces optimized)
 
 
382
  """)
383
+
384
+ # Status display
385
+ status_display = gr.Textbox(
386
+ label="Status",
387
+ value="Ready to connect - Click Connect to start real-time session",
388
+ interactive=False
389
+ )
390
+
391
+ # Control buttons (Task 3, 16-17)
 
 
 
 
 
 
392
  with gr.Row():
393
+ connect_btn = gr.Button("πŸ”— Connect", variant="primary")
394
+ mic_btn = gr.Button("πŸŽ™οΈ Test Microphone", variant="secondary")
395
+ screen_btn = gr.Button("πŸ–₯️ Show Your Screen", variant="secondary")
396
  disconnect_btn = gr.Button("πŸ”΄ Disconnect", variant="stop")
397
+
398
+ # Stream interface - FastRTC UI for microphone and video
399
+ gr.Markdown("### πŸ“‘ Live Audio/Video Stream")
 
 
 
 
 
400
  if stream:
401
+ # Mount the FastRTC stream UI - this provides microphone access
402
+ gr.HTML("""
403
+ <div id="fastrtc-container">
404
+ <p>πŸŽ™οΈ Microphone and video streaming handled by FastRTC</p>
405
+ <p>Click 'Test Microphone' and 'Show Your Screen' to activate</p>
406
+ </div>
407
+ """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
408
  else:
409
+ gr.HTML("<div>⚠️ Stream initialization failed - Check console for errors</div>")
410
+
411
+ # Microphone activation JavaScript
412
+ microphone_js = '''
413
+ (async function() {
414
+ try {
415
+ // Request microphone permission and start audio
416
+ const stream = await navigator.mediaDevices.getUserMedia({
417
+ audio: {
418
+ sampleRate: 16000,
419
+ channelCount: 1,
420
+ echoCancellation: true,
421
+ noiseSuppression: true
422
+ }
423
+ });
424
+ console.log("Microphone access granted");
425
+ return "πŸŽ™οΈ Microphone connected successfully";
426
+ } catch (error) {
427
+ console.error("Microphone error:", error);
428
+ if (error.name === "NotAllowedError") {
429
+ return "❌ Microphone permission denied - please allow microphone access";
430
+ } else {
431
+ return "❌ Microphone failed: " + error.message;
432
+ }
433
+ }
434
+ })()
435
+ '''
436
+
437
+ # Instructions (Task 1-3)
438
+ with gr.Accordion("πŸ“‹ Instructions", open=True):
439
  gr.Markdown("""
440
+ **How to use the real-time assistant:**
441
+
442
+ 1. **Connect**: Click Connect to start the AI session
443
+ 2. **Permissions**: Allow microphone and camera access
444
+ 3. **Show Screen**: Click "Show Your Screen" to share your screen
445
+ 4. **Voice Interaction**: Simply speak - the AI will respond
446
+ 5. **Real-time Guidance**: AI sees your screen and provides live help
447
+ 6. **Disconnect**: Click Disconnect when finished
448
+
449
+ **Features implemented from refactoring instructions:**
450
+ - βœ… FastRTC WebRTC streaming (Task 2)
451
+ - βœ… Google GenAI Live API integration (Task 7-15)
452
+ - βœ… Connect/Show Screen/Disconnect controls (Task 3, 16-17)
453
+ - βœ… Voice activity detection with ReplyOnPause (Task 3)
454
+ - βœ… Screen sharing via getDisplayMedia (Task 6)
455
+ - βœ… Real-time advice generation (Task 18-21)
456
+ - βœ… Cloudflare TURN for HF Spaces (Task 22-23)
457
  """)
458
+
459
+ # Privacy notice (Task 24-25)
460
+ with gr.Accordion("πŸ”’ Privacy & Security", open=False):
461
  gr.Markdown("""
462
+ **Privacy Notice:**
463
+ - Screen content and voice are processed by Google's AI services
464
+ - Data is transmitted securely via encrypted WebRTC connections
465
+ - No permanent storage - all processing is real-time
466
+ - You control what is shared and can disconnect anytime
467
+
468
+ **Technical Details:**
469
+ - Uses Google Gemini Live API for real-time multimodal processing
470
+ - FastRTC provides low-latency WebRTC streaming
471
+ - Cloudflare TURN servers ensure reliable connectivity on HF Spaces
472
+ - Voice activity detection prevents interruptions
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
473
  """)
474
+
475
+ # Wire up controls
476
  connect_btn.click(
477
  fn=handle_connect,
478
  outputs=[status_display]
479
  )
480
+
481
+ mic_btn.click(
482
+ fn=lambda: "πŸŽ™οΈ Testing microphone...",
483
+ outputs=[status_display],
484
+ js=microphone_js
485
+ )
486
+
487
+ screen_btn.click(
488
+ fn=lambda: "πŸ–₯️ Requesting screen share...",
489
+ outputs=[status_display],
490
+ js=screen_share_js
491
+ )
492
+
493
  disconnect_btn.click(
494
  fn=handle_disconnect,
495
  outputs=[status_display]
496
  )
497
+
 
 
 
 
 
 
498
  return demo
499
 
500
  # Main execution
501
  if __name__ == "__main__":
502
+ print("πŸ–₯️ Real-Time Screen Assistant")
503
+ print("=" * 50)
504
+ print("Refactored according to instructions for:")
505
+ print("- Google GenAI Live API integration")
506
+ print("- FastRTC real-time streaming")
507
+ print("- Voice activity detection")
508
+ print("- Screen sharing capabilities")
509
+ print("- Cloudflare TURN for HF Spaces")
510
+
 
 
 
 
 
511
  if not API_KEY:
512
  print("\n⚠️ No GEMINI_API_KEY environment variable found")
513
  print("Please set your Google AI API key:")
514
  print("export GEMINI_API_KEY='your-api-key-here'")
515
  else:
516
+ print(f"\nβœ… API key configured (length: {len(API_KEY)})")
517
+
518
+ print("\nπŸš€ Starting real-time assistant...")
519
+
520
  try:
521
  demo = create_interface()
522
  demo.launch(
 
527
  )
528
  except Exception as e:
529
  print(f"❌ Failed to launch: {e}")
530
+ print("Ensure all dependencies are installed: pip install -r requirements.txt")