arthrod committed on
Commit
dfcf7b9
Β·
verified Β·
1 Parent(s): 0a778fe

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +91 -93
app.py CHANGED
@@ -1,19 +1,19 @@
1
- """
2
- Real-Time Screen Assistant - Refactored for Google GenAI Live API + FastRTC
3
-
4
  This application transforms the original screenshot analyzer into a real-time
5
- screen sharing assistant with voice interaction, following the refactoring
6
  instructions for live streaming capabilities.
7
  """
8
 
9
- import os
10
  import asyncio
 
11
  import time
12
- import numpy as np
13
- import numpy.typing as npt
14
  import cv2
15
  import gradio as gr
16
- from fastrtc import Stream, AsyncAudioVideoStreamHandler, get_cloudflare_turn_credentials_async, ReplyOnPause
 
 
17
  from google import genai
18
  from google.genai import types
19
 
@@ -21,9 +21,8 @@ from google.genai import types
21
  API_KEY = os.getenv("GEMINI_API_KEY", "")
22
 
23
  class RealTimeScreenAssistant(AsyncAudioVideoStreamHandler):
24
- """
25
- Real-time screen assistant implementing the refactoring instructions.
26
-
27
  Features:
28
  - Google GenAI Live API integration
29
  - Real-time audio/video streaming via FastRTC
@@ -31,11 +30,11 @@ class RealTimeScreenAssistant(AsyncAudioVideoStreamHandler):
31
  - Intelligent frame sampling for screen sharing
32
  - Cloudflare TURN server support for HF Spaces
33
  """
34
-
35
  def __init__(self):
36
  super().__init__(
37
- expected_layout="mono",
38
- output_sample_rate=24000,
39
  input_sample_rate=16000
40
  )
41
  self.session = None
@@ -43,7 +42,7 @@ class RealTimeScreenAssistant(AsyncAudioVideoStreamHandler):
43
  self.audio_queue = asyncio.Queue()
44
  self.connected = False
45
  self.frame_interval = 1.0 # 1 FPS as per instructions
46
-
47
  async def start_up(self):
48
  """Initialize Google GenAI Live session as per Task 8-10"""
49
  try:
@@ -52,16 +51,16 @@ class RealTimeScreenAssistant(AsyncAudioVideoStreamHandler):
52
  if not current_api_key:
53
  print("❌ No GEMINI_API_KEY found in environment")
54
  return
55
-
56
  # Initialize client with v1alpha API (Task 8)
57
  client = genai.Client(
58
  api_key=current_api_key,
59
  http_options={"api_version": "v1alpha"}
60
  )
61
-
62
  # Configure live session (Task 9) - minimal working config
63
  from google.genai.types import LiveConnectConfig
64
-
65
  # Start with minimal config to avoid WebSocket errors
66
  config = LiveConnectConfig(
67
  system_instruction=(
@@ -70,24 +69,24 @@ class RealTimeScreenAssistant(AsyncAudioVideoStreamHandler):
70
  "you see and hear. Be proactive in offering assistance."
71
  )
72
  )
73
-
74
  # Connect to Live API (Task 10) - using async context manager
75
  self.session_context = client.aio.live.connect(
76
- model="gemini-2.0-flash-live-001",
77
  config=config
78
  )
79
  self.session = await self.session_context.__aenter__()
80
-
81
  self.connected = True
82
  print("βœ… Connected to Google GenAI Live API")
83
-
84
  # Start response handler (Task 13)
85
  self.response_task = asyncio.create_task(self._handle_responses())
86
-
87
  except Exception as e:
88
  print(f"❌ Failed to connect to GenAI: {e}")
89
  self.connected = False
90
-
91
  async def _handle_responses(self):
92
  """Handle AI responses as per Task 12-13"""
93
  try:
@@ -97,21 +96,21 @@ class RealTimeScreenAssistant(AsyncAudioVideoStreamHandler):
97
  try:
98
  # Get the next message from the session
99
  response_stream = self.session.receive()
100
-
101
  # Check if this is an async iterator or needs to be awaited
102
  if hasattr(response_stream, '__aiter__'):
103
  # It's an async iterator
104
  async for msg in response_stream:
105
  if not self.connected:
106
  break
107
-
108
  if msg.data: # Audio response
109
  # Convert to numpy for FastRTC (Task 13)
110
  audio_array = np.frombuffer(msg.data, dtype=np.int16)
111
  if len(audio_array) > 0:
112
  audio_array = audio_array.reshape(1, -1)
113
  await self.audio_queue.put(audio_array)
114
-
115
  if msg.text: # Text response
116
  print(f"πŸ€– AI: {msg.text}")
117
  else:
@@ -123,10 +122,10 @@ class RealTimeScreenAssistant(AsyncAudioVideoStreamHandler):
123
  if len(audio_array) > 0:
124
  audio_array = audio_array.reshape(1, -1)
125
  await self.audio_queue.put(audio_array)
126
-
127
  if msg.text: # Text response
128
  print(f"πŸ€– AI: {msg.text}")
129
-
130
  except Exception as inner_e:
131
  if "connection" in str(inner_e).lower() or "closed" in str(inner_e).lower():
132
  print("πŸ”΄ Connection closed, stopping response handler")
@@ -134,53 +133,53 @@ class RealTimeScreenAssistant(AsyncAudioVideoStreamHandler):
134
  else:
135
  print(f"⚠️ Response handling error: {inner_e}")
136
  await asyncio.sleep(0.1) # Brief pause before retry
137
-
138
  except Exception as e:
139
  print(f"❌ Error handling AI responses: {e}")
140
-
141
  async def receive(self, frame: tuple[int, npt.NDArray[np.int16]]):
142
  """Handle microphone audio (Task 11)"""
143
  if not self.connected or not self.session:
144
  return
145
-
146
  try:
147
  _, audio_np = frame
148
  audio_bytes = audio_np.tobytes()
149
-
150
  # Send audio to GenAI Live API using new non-deprecated method
151
  await self.session.send_realtime_input(
152
  input=types.Blob(
153
- data=audio_bytes,
154
  mime_type="audio/pcm;rate=16000"
155
  )
156
  )
157
  except Exception as e:
158
  print(f"❌ Error sending audio: {e}")
159
-
160
  async def video_receive(self, frame: npt.NDArray[np.float32]):
161
  """Handle screen video frames (Task 11-12)"""
162
  if not self.connected or not self.session:
163
  return
164
-
165
  try:
166
  # Throttle to 1 FPS as per instructions
167
  current_time = time.time()
168
  if current_time - self.last_frame_time < self.frame_interval:
169
  return
170
-
171
  self.last_frame_time = current_time
172
-
173
  # Convert float32 frame to uint8 for JPEG encoding
174
  if frame.dtype == np.float32:
175
  # Assuming frame is in range [0, 1], convert to [0, 255]
176
  frame_uint8 = (frame * 255).astype(np.uint8)
177
  else:
178
  frame_uint8 = frame.astype(np.uint8)
179
-
180
  # Check for empty frame before encoding
181
  if frame_uint8.size == 0 or frame_uint8.shape[0] == 0 or frame_uint8.shape[1] == 0:
182
  return
183
-
184
  # Encode as JPEG (Task 12)
185
  try:
186
  success, jpg_bytes = cv2.imencode('.jpg', frame_uint8, [cv2.IMWRITE_JPEG_QUALITY, 80])
@@ -189,18 +188,18 @@ class RealTimeScreenAssistant(AsyncAudioVideoStreamHandler):
189
  except cv2.error:
190
  # Handle OpenCV encoding errors gracefully
191
  return
192
-
193
  # Send to GenAI using new non-deprecated method
194
  await self.session.send_realtime_input(
195
  input=types.Blob(
196
- data=jpg_bytes.tobytes(),
197
  mime_type="image/jpeg"
198
  )
199
  )
200
-
201
  except Exception as e:
202
  print(f"❌ Error sending video frame: {e}")
203
-
204
  async def emit(self):
205
  """Emit audio back to user (Task 13)"""
206
  try:
@@ -208,23 +207,23 @@ class RealTimeScreenAssistant(AsyncAudioVideoStreamHandler):
208
  return (24000, audio_chunk)
209
  except asyncio.QueueEmpty:
210
  return None
211
-
212
  def copy(self):
213
  """Copy method required by FastRTC AsyncAudioVideoStreamHandler"""
214
  # Return a new instance with same configuration
215
  new_instance = RealTimeScreenAssistant()
216
  new_instance.frame_interval = self.frame_interval
217
  return new_instance
218
-
219
  async def video_emit(self):
220
  """Video emit method required by FastRTC AsyncAudioVideoStreamHandler"""
221
  # For this use case, we don't emit video back to user
222
  return None
223
-
224
  async def shutdown(self):
225
  """Clean shutdown (Task 17)"""
226
  self.connected = False
227
-
228
  # Cancel response handler task if it exists
229
  if hasattr(self, 'response_task') and not self.response_task.done():
230
  self.response_task.cancel()
@@ -232,7 +231,7 @@ class RealTimeScreenAssistant(AsyncAudioVideoStreamHandler):
232
  await self.response_task
233
  except asyncio.CancelledError:
234
  pass
235
-
236
  if self.session:
237
  try:
238
  # Properly close the session using context manager
@@ -243,7 +242,7 @@ class RealTimeScreenAssistant(AsyncAudioVideoStreamHandler):
243
  print("πŸ”΄ Disconnected from GenAI Live API")
244
  except Exception as e:
245
  print(f"❌ Error during shutdown: {e}")
246
-
247
  self.session = None
248
  if hasattr(self, 'session_context'):
249
  self.session_context = None
@@ -257,19 +256,19 @@ def initialize_real_time_assistant():
257
  # Create handler
258
  handler = RealTimeScreenAssistant()
259
  app_state["handler"] = handler
260
-
261
  # Create stream with Cloudflare TURN (Task 22-23)
262
  stream = Stream(
263
  handler=ReplyOnPause(handler), # Voice activity detection (Task 3)
264
- modality="audio-video",
265
  mode="send-receive",
266
  rtc_configuration=get_cloudflare_turn_credentials_async,
267
  time_limit=300 # 5 minute limit for Spaces
268
  )
269
-
270
  app_state["stream"] = stream
271
  return stream
272
-
273
  except Exception as e:
274
  print(f"❌ Error creating stream: {e}")
275
  return None
@@ -280,10 +279,10 @@ def handle_connect():
280
  current_api_key = os.getenv("GEMINI_API_KEY", "")
281
  if not current_api_key:
282
  return "❌ Please set GEMINI_API_KEY environment variable"
283
-
284
  if app_state["connected"]:
285
  return "βœ… Already connected - session is active"
286
-
287
  app_state["connected"] = True
288
  return "βœ… Connecting... Please allow microphone and camera permissions"
289
 
@@ -293,7 +292,7 @@ def handle_disconnect():
293
  asyncio.create_task(app_state["handler"].shutdown())
294
  app_state["connected"] = False
295
  return "πŸ”΄ Disconnected from AI assistant"
296
-
297
  return "Already disconnected"
298
 
299
  # Screen sharing JavaScript - Fixed syntax for HF Spaces
@@ -303,7 +302,7 @@ screen_share_js = '''
303
  if (!navigator.mediaDevices || !navigator.mediaDevices.getDisplayMedia) {
304
  return "❌ Screen sharing not supported in this browser";
305
  }
306
-
307
  const stream = await navigator.mediaDevices.getDisplayMedia({
308
  video: {
309
  width: { ideal: 1920 },
@@ -311,36 +310,36 @@ screen_share_js = '''
311
  },
312
  audio: false
313
  });
314
-
315
  // Find the video element from FastRTC
316
  const videos = document.querySelectorAll('video');
317
  let targetVideo = null;
318
-
319
  for (let video of videos) {
320
  if (video.srcObject && video.srcObject.getVideoTracks().length > 0) {
321
  targetVideo = video;
322
  break;
323
  }
324
  }
325
-
326
  if (targetVideo && targetVideo.srcObject) {
327
  // Replace the camera track with screen track
328
  const screenTrack = stream.getVideoTracks()[0];
329
  const sender = targetVideo.srcObject.getVideoTracks()[0];
330
-
331
  // Remove old track and add screen track
332
  targetVideo.srcObject.removeTrack(sender);
333
  targetVideo.srcObject.addTrack(screenTrack);
334
-
335
  screenTrack.onended = () => {
336
  console.log("Screen sharing ended");
337
  };
338
-
339
  return "πŸ–₯️ Screen sharing started successfully!";
340
  } else {
341
  return "❌ Could not find video stream to replace";
342
  }
343
-
344
  } catch (error) {
345
  console.error("Screen sharing error:", error);
346
  if (error.name === "NotAllowedError") {
@@ -355,46 +354,45 @@ screen_share_js = '''
355
 
356
  def create_interface():
357
  """Create main interface (Task 26-30)"""
358
-
359
  # Initialize stream
360
  stream = initialize_real_time_assistant()
361
-
362
  with gr.Blocks(
363
- title="Real-Time Screen Assistant",
364
  theme=gr.themes.Soft()
365
  ) as demo:
366
-
367
  gr.Markdown("# πŸ–₯️ Real-Time Screen Assistant")
368
  gr.Markdown("""
369
  **🎯 LIVE AI that sees your screen and provides real-time guidance!**
370
-
371
  **How it works:**
372
  1. **Connect** - Links to Google's GenAI Live API for real-time AI processing
373
- 2. **Share Screen** - AI can see exactly what you're doing on your screen
374
  3. **Voice Chat** - Talk naturally, AI responds with voice and sees everything
375
  4. **Get Help** - Real-time assistance with software, coding, troubleshooting
376
-
377
  **Tech Stack:**
378
  - 🧠 Google GenAI Live API (multimodal real-time AI)
379
  - πŸ“Ή FastRTC (low-latency screen/audio streaming)
380
- - πŸŽ™οΈ Voice activity detection
381
  - 🌐 Cloudflare TURN servers (HF Spaces optimized)
382
  """)
383
-
384
  # Status display
385
  status_display = gr.Textbox(
386
  label="Status",
387
  value="Ready to connect - Click Connect to start real-time session",
388
  interactive=False
389
  )
390
-
391
  # Control buttons (Task 3, 16-17)
392
  with gr.Row():
393
  connect_btn = gr.Button("πŸ”— Connect", variant="primary")
394
  mic_btn = gr.Button("πŸŽ™οΈ Test Microphone", variant="secondary")
395
- screen_btn = gr.Button("πŸ–₯️ Show Your Screen", variant="secondary")
396
  disconnect_btn = gr.Button("πŸ”΄ Disconnect", variant="stop")
397
-
398
  # Stream interface - FastRTC UI for microphone and video
399
  gr.Markdown("### πŸ“‘ Live Audio/Video Stream")
400
  if stream:
@@ -407,7 +405,7 @@ def create_interface():
407
  """)
408
  else:
409
  gr.HTML("<div>⚠️ Stream initialization failed - Check console for errors</div>")
410
-
411
  # Microphone activation JavaScript
412
  microphone_js = '''
413
  (async function() {
@@ -433,19 +431,19 @@ def create_interface():
433
  }
434
  })()
435
  '''
436
-
437
  # Instructions (Task 1-3)
438
  with gr.Accordion("πŸ“‹ Instructions", open=True):
439
  gr.Markdown("""
440
  **How to use the real-time assistant:**
441
-
442
  1. **Connect**: Click Connect to start the AI session
443
  2. **Permissions**: Allow microphone and camera access
444
  3. **Show Screen**: Click "Show Your Screen" to share your screen
445
  4. **Voice Interaction**: Simply speak - the AI will respond
446
  5. **Real-time Guidance**: AI sees your screen and provides live help
447
  6. **Disconnect**: Click Disconnect when finished
448
-
449
  **Features implemented from refactoring instructions:**
450
  - βœ… FastRTC WebRTC streaming (Task 2)
451
  - βœ… Google GenAI Live API integration (Task 7-15)
@@ -455,7 +453,7 @@ def create_interface():
455
  - βœ… Real-time advice generation (Task 18-21)
456
  - βœ… Cloudflare TURN for HF Spaces (Task 22-23)
457
  """)
458
-
459
  # Privacy notice (Task 24-25)
460
  with gr.Accordion("πŸ”’ Privacy & Security", open=False):
461
  gr.Markdown("""
@@ -464,37 +462,37 @@ def create_interface():
464
  - Data is transmitted securely via encrypted WebRTC connections
465
  - No permanent storage - all processing is real-time
466
  - You control what is shared and can disconnect anytime
467
-
468
  **Technical Details:**
469
  - Uses Google Gemini Live API for real-time multimodal processing
470
  - FastRTC provides low-latency WebRTC streaming
471
  - Cloudflare TURN servers ensure reliable connectivity on HF Spaces
472
  - Voice activity detection prevents interruptions
473
  """)
474
-
475
  # Wire up controls
476
  connect_btn.click(
477
  fn=handle_connect,
478
  outputs=[status_display]
479
  )
480
-
481
  mic_btn.click(
482
  fn=lambda: "πŸŽ™οΈ Testing microphone...",
483
  outputs=[status_display],
484
  js=microphone_js
485
  )
486
-
487
  screen_btn.click(
488
  fn=lambda: "πŸ–₯️ Requesting screen share...",
489
  outputs=[status_display],
490
  js=screen_share_js
491
  )
492
-
493
  disconnect_btn.click(
494
  fn=handle_disconnect,
495
  outputs=[status_display]
496
  )
497
-
498
  return demo
499
 
500
  # Main execution
@@ -503,20 +501,20 @@ if __name__ == "__main__":
503
  print("=" * 50)
504
  print("Refactored according to instructions for:")
505
  print("- Google GenAI Live API integration")
506
- print("- FastRTC real-time streaming")
507
  print("- Voice activity detection")
508
  print("- Screen sharing capabilities")
509
  print("- Cloudflare TURN for HF Spaces")
510
-
511
  if not API_KEY:
512
  print("\n⚠️ No GEMINI_API_KEY environment variable found")
513
  print("Please set your Google AI API key:")
514
  print("export GEMINI_API_KEY='your-api-key-here'")
515
  else:
516
  print(f"\nβœ… API key configured (length: {len(API_KEY)})")
517
-
518
  print("\nπŸš€ Starting real-time assistant...")
519
-
520
  try:
521
  demo = create_interface()
522
  demo.launch(
@@ -527,4 +525,4 @@ if __name__ == "__main__":
527
  )
528
  except Exception as e:
529
  print(f"❌ Failed to launch: {e}")
530
- print("Ensure all dependencies are installed: pip install -r requirements.txt")
 
1
+ """Real-Time Screen Assistant - Refactored for Google GenAI Live API + FastRTC
2
+
 
3
  This application transforms the original screenshot analyzer into a real-time
4
+ screen sharing assistant with voice interaction, following the refactoring
5
  instructions for live streaming capabilities.
6
  """
7
 
 
8
  import asyncio
9
+ import os
10
  import time
11
+
 
12
  import cv2
13
  import gradio as gr
14
+ import numpy as np
15
+ import numpy.typing as npt
16
+ from fastrtc import AsyncAudioVideoStreamHandler, ReplyOnPause, Stream, get_cloudflare_turn_credentials_async
17
  from google import genai
18
  from google.genai import types
19
 
 
21
  API_KEY = os.getenv("GEMINI_API_KEY", "")
22
 
23
  class RealTimeScreenAssistant(AsyncAudioVideoStreamHandler):
24
+ """Real-time screen assistant implementing the refactoring instructions.
25
+
 
26
  Features:
27
  - Google GenAI Live API integration
28
  - Real-time audio/video streaming via FastRTC
 
30
  - Intelligent frame sampling for screen sharing
31
  - Cloudflare TURN server support for HF Spaces
32
  """
33
+
34
  def __init__(self):
35
  super().__init__(
36
+ expected_layout="mono",
37
+ output_sample_rate=24000,
38
  input_sample_rate=16000
39
  )
40
  self.session = None
 
42
  self.audio_queue = asyncio.Queue()
43
  self.connected = False
44
  self.frame_interval = 1.0 # 1 FPS as per instructions
45
+
46
  async def start_up(self):
47
  """Initialize Google GenAI Live session as per Task 8-10"""
48
  try:
 
51
  if not current_api_key:
52
  print("❌ No GEMINI_API_KEY found in environment")
53
  return
54
+
55
  # Initialize client with v1alpha API (Task 8)
56
  client = genai.Client(
57
  api_key=current_api_key,
58
  http_options={"api_version": "v1alpha"}
59
  )
60
+
61
  # Configure live session (Task 9) - minimal working config
62
  from google.genai.types import LiveConnectConfig
63
+
64
  # Start with minimal config to avoid WebSocket errors
65
  config = LiveConnectConfig(
66
  system_instruction=(
 
69
  "you see and hear. Be proactive in offering assistance."
70
  )
71
  )
72
+
73
  # Connect to Live API (Task 10) - using async context manager
74
  self.session_context = client.aio.live.connect(
75
+ model="gemini-2.0-flash-live-001",
76
  config=config
77
  )
78
  self.session = await self.session_context.__aenter__()
79
+
80
  self.connected = True
81
  print("βœ… Connected to Google GenAI Live API")
82
+
83
  # Start response handler (Task 13)
84
  self.response_task = asyncio.create_task(self._handle_responses())
85
+
86
  except Exception as e:
87
  print(f"❌ Failed to connect to GenAI: {e}")
88
  self.connected = False
89
+
90
  async def _handle_responses(self):
91
  """Handle AI responses as per Task 12-13"""
92
  try:
 
96
  try:
97
  # Get the next message from the session
98
  response_stream = self.session.receive()
99
+
100
  # Check if this is an async iterator or needs to be awaited
101
  if hasattr(response_stream, '__aiter__'):
102
  # It's an async iterator
103
  async for msg in response_stream:
104
  if not self.connected:
105
  break
106
+
107
  if msg.data: # Audio response
108
  # Convert to numpy for FastRTC (Task 13)
109
  audio_array = np.frombuffer(msg.data, dtype=np.int16)
110
  if len(audio_array) > 0:
111
  audio_array = audio_array.reshape(1, -1)
112
  await self.audio_queue.put(audio_array)
113
+
114
  if msg.text: # Text response
115
  print(f"πŸ€– AI: {msg.text}")
116
  else:
 
122
  if len(audio_array) > 0:
123
  audio_array = audio_array.reshape(1, -1)
124
  await self.audio_queue.put(audio_array)
125
+
126
  if msg.text: # Text response
127
  print(f"πŸ€– AI: {msg.text}")
128
+
129
  except Exception as inner_e:
130
  if "connection" in str(inner_e).lower() or "closed" in str(inner_e).lower():
131
  print("πŸ”΄ Connection closed, stopping response handler")
 
133
  else:
134
  print(f"⚠️ Response handling error: {inner_e}")
135
  await asyncio.sleep(0.1) # Brief pause before retry
136
+
137
  except Exception as e:
138
  print(f"❌ Error handling AI responses: {e}")
139
+
140
  async def receive(self, frame: tuple[int, npt.NDArray[np.int16]]):
141
  """Handle microphone audio (Task 11)"""
142
  if not self.connected or not self.session:
143
  return
144
+
145
  try:
146
  _, audio_np = frame
147
  audio_bytes = audio_np.tobytes()
148
+
149
  # Send audio to GenAI Live API using new non-deprecated method
150
  await self.session.send_realtime_input(
151
  input=types.Blob(
152
+ data=audio_bytes,
153
  mime_type="audio/pcm;rate=16000"
154
  )
155
  )
156
  except Exception as e:
157
  print(f"❌ Error sending audio: {e}")
158
+
159
  async def video_receive(self, frame: npt.NDArray[np.float32]):
160
  """Handle screen video frames (Task 11-12)"""
161
  if not self.connected or not self.session:
162
  return
163
+
164
  try:
165
  # Throttle to 1 FPS as per instructions
166
  current_time = time.time()
167
  if current_time - self.last_frame_time < self.frame_interval:
168
  return
169
+
170
  self.last_frame_time = current_time
171
+
172
  # Convert float32 frame to uint8 for JPEG encoding
173
  if frame.dtype == np.float32:
174
  # Assuming frame is in range [0, 1], convert to [0, 255]
175
  frame_uint8 = (frame * 255).astype(np.uint8)
176
  else:
177
  frame_uint8 = frame.astype(np.uint8)
178
+
179
  # Check for empty frame before encoding
180
  if frame_uint8.size == 0 or frame_uint8.shape[0] == 0 or frame_uint8.shape[1] == 0:
181
  return
182
+
183
  # Encode as JPEG (Task 12)
184
  try:
185
  success, jpg_bytes = cv2.imencode('.jpg', frame_uint8, [cv2.IMWRITE_JPEG_QUALITY, 80])
 
188
  except cv2.error:
189
  # Handle OpenCV encoding errors gracefully
190
  return
191
+
192
  # Send to GenAI using new non-deprecated method
193
  await self.session.send_realtime_input(
194
  input=types.Blob(
195
+ data=jpg_bytes.tobytes(),
196
  mime_type="image/jpeg"
197
  )
198
  )
199
+
200
  except Exception as e:
201
  print(f"❌ Error sending video frame: {e}")
202
+
203
  async def emit(self):
204
  """Emit audio back to user (Task 13)"""
205
  try:
 
207
  return (24000, audio_chunk)
208
  except asyncio.QueueEmpty:
209
  return None
210
+
211
  def copy(self):
212
  """Copy method required by FastRTC AsyncAudioVideoStreamHandler"""
213
  # Return a new instance with same configuration
214
  new_instance = RealTimeScreenAssistant()
215
  new_instance.frame_interval = self.frame_interval
216
  return new_instance
217
+
218
  async def video_emit(self):
219
  """Video emit method required by FastRTC AsyncAudioVideoStreamHandler"""
220
  # For this use case, we don't emit video back to user
221
  return None
222
+
223
  async def shutdown(self):
224
  """Clean shutdown (Task 17)"""
225
  self.connected = False
226
+
227
  # Cancel response handler task if it exists
228
  if hasattr(self, 'response_task') and not self.response_task.done():
229
  self.response_task.cancel()
 
231
  await self.response_task
232
  except asyncio.CancelledError:
233
  pass
234
+
235
  if self.session:
236
  try:
237
  # Properly close the session using context manager
 
242
  print("πŸ”΄ Disconnected from GenAI Live API")
243
  except Exception as e:
244
  print(f"❌ Error during shutdown: {e}")
245
+
246
  self.session = None
247
  if hasattr(self, 'session_context'):
248
  self.session_context = None
 
256
  # Create handler
257
  handler = RealTimeScreenAssistant()
258
  app_state["handler"] = handler
259
+
260
  # Create stream with Cloudflare TURN (Task 22-23)
261
  stream = Stream(
262
  handler=ReplyOnPause(handler), # Voice activity detection (Task 3)
263
+ modality="audio-video",
264
  mode="send-receive",
265
  rtc_configuration=get_cloudflare_turn_credentials_async,
266
  time_limit=300 # 5 minute limit for Spaces
267
  )
268
+
269
  app_state["stream"] = stream
270
  return stream
271
+
272
  except Exception as e:
273
  print(f"❌ Error creating stream: {e}")
274
  return None
 
279
  current_api_key = os.getenv("GEMINI_API_KEY", "")
280
  if not current_api_key:
281
  return "❌ Please set GEMINI_API_KEY environment variable"
282
+
283
  if app_state["connected"]:
284
  return "βœ… Already connected - session is active"
285
+
286
  app_state["connected"] = True
287
  return "βœ… Connecting... Please allow microphone and camera permissions"
288
 
 
292
  asyncio.create_task(app_state["handler"].shutdown())
293
  app_state["connected"] = False
294
  return "πŸ”΄ Disconnected from AI assistant"
295
+
296
  return "Already disconnected"
297
 
298
  # Screen sharing JavaScript - Fixed syntax for HF Spaces
 
302
  if (!navigator.mediaDevices || !navigator.mediaDevices.getDisplayMedia) {
303
  return "❌ Screen sharing not supported in this browser";
304
  }
305
+
306
  const stream = await navigator.mediaDevices.getDisplayMedia({
307
  video: {
308
  width: { ideal: 1920 },
 
310
  },
311
  audio: false
312
  });
313
+
314
  // Find the video element from FastRTC
315
  const videos = document.querySelectorAll('video');
316
  let targetVideo = null;
317
+
318
  for (let video of videos) {
319
  if (video.srcObject && video.srcObject.getVideoTracks().length > 0) {
320
  targetVideo = video;
321
  break;
322
  }
323
  }
324
+
325
  if (targetVideo && targetVideo.srcObject) {
326
  // Replace the camera track with screen track
327
  const screenTrack = stream.getVideoTracks()[0];
328
  const sender = targetVideo.srcObject.getVideoTracks()[0];
329
+
330
  // Remove old track and add screen track
331
  targetVideo.srcObject.removeTrack(sender);
332
  targetVideo.srcObject.addTrack(screenTrack);
333
+
334
  screenTrack.onended = () => {
335
  console.log("Screen sharing ended");
336
  };
337
+
338
  return "πŸ–₯️ Screen sharing started successfully!";
339
  } else {
340
  return "❌ Could not find video stream to replace";
341
  }
342
+
343
  } catch (error) {
344
  console.error("Screen sharing error:", error);
345
  if (error.name === "NotAllowedError") {
 
354
 
355
  def create_interface():
356
  """Create main interface (Task 26-30)"""
 
357
  # Initialize stream
358
  stream = initialize_real_time_assistant()
359
+
360
  with gr.Blocks(
361
+ title="Real-Time Screen Assistant",
362
  theme=gr.themes.Soft()
363
  ) as demo:
364
+
365
  gr.Markdown("# πŸ–₯️ Real-Time Screen Assistant")
366
  gr.Markdown("""
367
  **🎯 LIVE AI that sees your screen and provides real-time guidance!**
368
+
369
  **How it works:**
370
  1. **Connect** - Links to Google's GenAI Live API for real-time AI processing
371
+ 2. **Share Screen** - AI can see exactly what you're doing on your screen
372
  3. **Voice Chat** - Talk naturally, AI responds with voice and sees everything
373
  4. **Get Help** - Real-time assistance with software, coding, troubleshooting
374
+
375
  **Tech Stack:**
376
  - 🧠 Google GenAI Live API (multimodal real-time AI)
377
  - πŸ“Ή FastRTC (low-latency screen/audio streaming)
378
+ - πŸŽ™οΈ Voice activity detection
379
  - 🌐 Cloudflare TURN servers (HF Spaces optimized)
380
  """)
381
+
382
  # Status display
383
  status_display = gr.Textbox(
384
  label="Status",
385
  value="Ready to connect - Click Connect to start real-time session",
386
  interactive=False
387
  )
388
+
389
  # Control buttons (Task 3, 16-17)
390
  with gr.Row():
391
  connect_btn = gr.Button("πŸ”— Connect", variant="primary")
392
  mic_btn = gr.Button("πŸŽ™οΈ Test Microphone", variant="secondary")
393
+ screen_btn = gr.Button("πŸ–₯️ Show Your Screen", variant="secondary")
394
  disconnect_btn = gr.Button("πŸ”΄ Disconnect", variant="stop")
395
+
396
  # Stream interface - FastRTC UI for microphone and video
397
  gr.Markdown("### πŸ“‘ Live Audio/Video Stream")
398
  if stream:
 
405
  """)
406
  else:
407
  gr.HTML("<div>⚠️ Stream initialization failed - Check console for errors</div>")
408
+
409
  # Microphone activation JavaScript
410
  microphone_js = '''
411
  (async function() {
 
431
  }
432
  })()
433
  '''
434
+
435
  # Instructions (Task 1-3)
436
  with gr.Accordion("πŸ“‹ Instructions", open=True):
437
  gr.Markdown("""
438
  **How to use the real-time assistant:**
439
+
440
  1. **Connect**: Click Connect to start the AI session
441
  2. **Permissions**: Allow microphone and camera access
442
  3. **Show Screen**: Click "Show Your Screen" to share your screen
443
  4. **Voice Interaction**: Simply speak - the AI will respond
444
  5. **Real-time Guidance**: AI sees your screen and provides live help
445
  6. **Disconnect**: Click Disconnect when finished
446
+
447
  **Features implemented from refactoring instructions:**
448
  - βœ… FastRTC WebRTC streaming (Task 2)
449
  - βœ… Google GenAI Live API integration (Task 7-15)
 
453
  - βœ… Real-time advice generation (Task 18-21)
454
  - βœ… Cloudflare TURN for HF Spaces (Task 22-23)
455
  """)
456
+
457
  # Privacy notice (Task 24-25)
458
  with gr.Accordion("πŸ”’ Privacy & Security", open=False):
459
  gr.Markdown("""
 
462
  - Data is transmitted securely via encrypted WebRTC connections
463
  - No permanent storage - all processing is real-time
464
  - You control what is shared and can disconnect anytime
465
+
466
  **Technical Details:**
467
  - Uses Google Gemini Live API for real-time multimodal processing
468
  - FastRTC provides low-latency WebRTC streaming
469
  - Cloudflare TURN servers ensure reliable connectivity on HF Spaces
470
  - Voice activity detection prevents interruptions
471
  """)
472
+
473
  # Wire up controls
474
  connect_btn.click(
475
  fn=handle_connect,
476
  outputs=[status_display]
477
  )
478
+
479
  mic_btn.click(
480
  fn=lambda: "πŸŽ™οΈ Testing microphone...",
481
  outputs=[status_display],
482
  js=microphone_js
483
  )
484
+
485
  screen_btn.click(
486
  fn=lambda: "πŸ–₯️ Requesting screen share...",
487
  outputs=[status_display],
488
  js=screen_share_js
489
  )
490
+
491
  disconnect_btn.click(
492
  fn=handle_disconnect,
493
  outputs=[status_display]
494
  )
495
+
496
  return demo
497
 
498
  # Main execution
 
501
  print("=" * 50)
502
  print("Refactored according to instructions for:")
503
  print("- Google GenAI Live API integration")
504
+ print("- FastRTC real-time streaming")
505
  print("- Voice activity detection")
506
  print("- Screen sharing capabilities")
507
  print("- Cloudflare TURN for HF Spaces")
508
+
509
  if not API_KEY:
510
  print("\n⚠️ No GEMINI_API_KEY environment variable found")
511
  print("Please set your Google AI API key:")
512
  print("export GEMINI_API_KEY='your-api-key-here'")
513
  else:
514
  print(f"\nβœ… API key configured (length: {len(API_KEY)})")
515
+
516
  print("\nπŸš€ Starting real-time assistant...")
517
+
518
  try:
519
  demo = create_interface()
520
  demo.launch(
 
525
  )
526
  except Exception as e:
527
  print(f"❌ Failed to launch: {e}")
528
+ print("Ensure all dependencies are installed: pip install -r requirements.txt")