Update app.py
app.py CHANGED
@@ -1,19 +1,19 @@
-"""
-
-
+"""Real-Time Screen Assistant - Refactored for Google GenAI Live API + FastRTC
+xxx
 This application transforms the original screenshot analyzer into a real-time
-screen sharing assistant with voice interaction, following the refactoring
+screen sharing assistant with voice interaction, following the refactoring
 instructions for live streaming capabilities.
 """
 
-import os
 import asyncio
+import os
 import time
-
-import numpy.typing as npt
+
 import cv2
 import gradio as gr
-
+import numpy as np
+import numpy.typing as npt
+from fastrtc import AsyncAudioVideoStreamHandler, ReplyOnPause, Stream, get_cloudflare_turn_credentials_async
 from google import genai
 from google.genai import types
 
@@ -21,9 +21,8 @@ from google.genai import types
 API_KEY = os.getenv("GEMINI_API_KEY", "")
 
 class RealTimeScreenAssistant(AsyncAudioVideoStreamHandler):
-    """
-
-
+    """Real-time screen assistant implementing the refactoring instructions.
+
     Features:
     - Google GenAI Live API integration
     - Real-time audio/video streaming via FastRTC
@@ -31,11 +30,11 @@ class RealTimeScreenAssistant(AsyncAudioVideoStreamHandler):
     - Intelligent frame sampling for screen sharing
     - Cloudflare TURN server support for HF Spaces
     """
-
+
     def __init__(self):
         super().__init__(
-            expected_layout="mono",
-            output_sample_rate=24000,
+            expected_layout="mono",
+            output_sample_rate=24000,
             input_sample_rate=16000
         )
         self.session = None
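For orientation on the constructor above: judging from the type hints later in this file, FastRTC-style handlers exchange audio as (sample_rate, int16 numpy array) tuples. A minimal sketch of that frame shape in plain numpy; the names here are illustrative, not part of the app:

    import numpy as np

    rate = 16000                                   # microphone input rate set above
    samples = np.zeros((1, rate // 50), np.int16)  # 20 ms of mono silence, shape (channels, n)
    frame = (rate, samples)                        # the (rate, array) tuple a handler receives
    pcm_bytes = samples.tobytes()                  # little-endian 16-bit PCM for the wire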
@@ -43,7 +42,7 @@ class RealTimeScreenAssistant(AsyncAudioVideoStreamHandler):
         self.audio_queue = asyncio.Queue()
         self.connected = False
         self.frame_interval = 1.0  # 1 FPS as per instructions
-
+
     async def start_up(self):
         """Initialize Google GenAI Live session as per Task 8-10"""
         try:
@@ -52,16 +51,16 @@ class RealTimeScreenAssistant(AsyncAudioVideoStreamHandler):
             if not current_api_key:
                 print("❌ No GEMINI_API_KEY found in environment")
                 return
-
+
             # Initialize client with v1alpha API (Task 8)
             client = genai.Client(
                 api_key=current_api_key,
                 http_options={"api_version": "v1alpha"}
             )
-
+
             # Configure live session (Task 9) - minimal working config
             from google.genai.types import LiveConnectConfig
-
+
             # Start with minimal config to avoid WebSocket errors
             config = LiveConnectConfig(
                 system_instruction=(
@@ -70,24 +69,24 @@ class RealTimeScreenAssistant(AsyncAudioVideoStreamHandler):
                     "you see and hear. Be proactive in offering assistance."
                 )
             )
-
+
             # Connect to Live API (Task 10) - using async context manager
             self.session_context = client.aio.live.connect(
-                model="gemini-2.0-flash-live-001",
+                model="gemini-2.0-flash-live-001",
                 config=config
             )
             self.session = await self.session_context.__aenter__()
-
+
             self.connected = True
             print("✅ Connected to Google GenAI Live API")
-
+
             # Start response handler (Task 13)
             self.response_task = asyncio.create_task(self._handle_responses())
-
+
         except Exception as e:
             print(f"❌ Failed to connect to GenAI: {e}")
             self.connected = False
-
+
     async def _handle_responses(self):
         """Handle AI responses as per Task 12-13"""
         try:
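The connect step above enters an async context manager by hand and keeps the manager object around so it can be exited at shutdown. A small self-contained sketch of that pattern, with a toy stand-in class rather than the real Live API session:

    import asyncio

    class ToySession:
        # Stand-in for the Live API session object; illustrative only.
        async def __aenter__(self):
            print("session opened")
            return self

        async def __aexit__(self, exc_type, exc, tb):
            print("session closed")

    async def main():
        ctx = ToySession()                # keep the context manager itself...
        session = await ctx.__aenter__()  # ...so __aexit__ can be called later
        try:
            pass                          # use the session here
        finally:
            await ctx.__aexit__(None, None, None)

    asyncio.run(main())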
@@ -97,21 +96,21 @@ class RealTimeScreenAssistant(AsyncAudioVideoStreamHandler):
                 try:
                     # Get the next message from the session
                     response_stream = self.session.receive()
-
+
                     # Check if this is an async iterator or needs to be awaited
                     if hasattr(response_stream, '__aiter__'):
                         # It's an async iterator
                         async for msg in response_stream:
                             if not self.connected:
                                 break
-
+
                             if msg.data:  # Audio response
                                 # Convert to numpy for FastRTC (Task 13)
                                 audio_array = np.frombuffer(msg.data, dtype=np.int16)
                                 if len(audio_array) > 0:
                                     audio_array = audio_array.reshape(1, -1)
                                     await self.audio_queue.put(audio_array)
-
+
                             if msg.text:  # Text response
                                 print(f"🤖 AI: {msg.text}")
                     else:
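The msg.data branch converts raw PCM bytes into the (channels, samples) array that the audio queue carries to playback. A runnable sketch of just that conversion, with synthetic bytes standing in for a Live API payload:

    import numpy as np

    data = np.arange(4, dtype=np.int16).tobytes()      # synthetic stand-in for msg.data
    audio_array = np.frombuffer(data, dtype=np.int16)  # flat int16 samples
    if len(audio_array) > 0:
        audio_array = audio_array.reshape(1, -1)       # (channels, samples) for playback
    print(audio_array.shape)                           # -> (1, 4)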
@@ -123,10 +122,10 @@ class RealTimeScreenAssistant(AsyncAudioVideoStreamHandler):
                             if len(audio_array) > 0:
                                 audio_array = audio_array.reshape(1, -1)
                                 await self.audio_queue.put(audio_array)
-
+
                             if msg.text:  # Text response
                                 print(f"🤖 AI: {msg.text}")
-
+
                 except Exception as inner_e:
                     if "connection" in str(inner_e).lower() or "closed" in str(inner_e).lower():
                         print("🔴 Connection closed, stopping response handler")
@@ -134,53 +133,53 @@ class RealTimeScreenAssistant(AsyncAudioVideoStreamHandler):
                     else:
                         print(f"⚠️ Response handling error: {inner_e}")
                         await asyncio.sleep(0.1)  # Brief pause before retry
-
+
         except Exception as e:
             print(f"❌ Error handling AI responses: {e}")
-
+
     async def receive(self, frame: tuple[int, npt.NDArray[np.int16]]):
         """Handle microphone audio (Task 11)"""
         if not self.connected or not self.session:
             return
-
+
         try:
             _, audio_np = frame
             audio_bytes = audio_np.tobytes()
-
+
             # Send audio to GenAI Live API using new non-deprecated method
             await self.session.send_realtime_input(
                 input=types.Blob(
-                    data=audio_bytes,
+                    data=audio_bytes,
                     mime_type="audio/pcm;rate=16000"
                 )
             )
         except Exception as e:
             print(f"❌ Error sending audio: {e}")
-
+
     async def video_receive(self, frame: npt.NDArray[np.float32]):
         """Handle screen video frames (Task 11-12)"""
         if not self.connected or not self.session:
             return
-
+
         try:
             # Throttle to 1 FPS as per instructions
             current_time = time.time()
             if current_time - self.last_frame_time < self.frame_interval:
                 return
-
+
             self.last_frame_time = current_time
-
+
             # Convert float32 frame to uint8 for JPEG encoding
             if frame.dtype == np.float32:
                 # Assuming frame is in range [0, 1], convert to [0, 255]
                 frame_uint8 = (frame * 255).astype(np.uint8)
             else:
                 frame_uint8 = frame.astype(np.uint8)
-
+
             # Check for empty frame before encoding
             if frame_uint8.size == 0 or frame_uint8.shape[0] == 0 or frame_uint8.shape[1] == 0:
                 return
-
+
             # Encode as JPEG (Task 12)
             try:
                 success, jpg_bytes = cv2.imencode('.jpg', frame_uint8, [cv2.IMWRITE_JPEG_QUALITY, 80])
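video_receive combines three steps: a time-based 1 FPS gate, float-to-uint8 conversion, and JPEG encoding. A standalone sketch of the same pipeline, assuming float32 frames in [0, 1] as the code above does:

    import time

    import cv2
    import numpy as np

    last_frame_time = 0.0
    frame_interval = 1.0  # seconds between frames that are kept (1 FPS)

    def maybe_encode(frame):
        global last_frame_time
        now = time.time()
        if now - last_frame_time < frame_interval:
            return None                          # too soon: drop this frame
        last_frame_time = now
        if frame.dtype == np.float32:            # assume [0, 1] floats, as above
            frame = (frame * 255).astype(np.uint8)
        ok, jpg = cv2.imencode('.jpg', frame, [cv2.IMWRITE_JPEG_QUALITY, 80])
        return jpg.tobytes() if ok else None

    frame = np.random.rand(48, 64, 3).astype(np.float32)
    print(len(maybe_encode(frame) or b""))       # size of the JPEG payload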
@@ -189,18 +188,18 @@ class RealTimeScreenAssistant(AsyncAudioVideoStreamHandler):
             except cv2.error:
                 # Handle OpenCV encoding errors gracefully
                 return
-
+
             # Send to GenAI using new non-deprecated method
             await self.session.send_realtime_input(
                 input=types.Blob(
-                    data=jpg_bytes.tobytes(),
+                    data=jpg_bytes.tobytes(),
                     mime_type="image/jpeg"
                 )
             )
-
+
         except Exception as e:
             print(f"❌ Error sending video frame: {e}")
-
+
     async def emit(self):
         """Emit audio back to user (Task 13)"""
         try:
@@ -208,23 +207,23 @@ class RealTimeScreenAssistant(AsyncAudioVideoStreamHandler):
             return (24000, audio_chunk)
         except asyncio.QueueEmpty:
             return None
-
+
     def copy(self):
         """Copy method required by FastRTC AsyncAudioVideoStreamHandler"""
         # Return a new instance with same configuration
         new_instance = RealTimeScreenAssistant()
         new_instance.frame_interval = self.frame_interval
         return new_instance
-
+
     async def video_emit(self):
         """Video emit method required by FastRTC AsyncAudioVideoStreamHandler"""
         # For this use case, we don't emit video back to user
         return None
-
+
     async def shutdown(self):
         """Clean shutdown (Task 17)"""
         self.connected = False
-
+
         # Cancel response handler task if it exists
         if hasattr(self, 'response_task') and not self.response_task.done():
            self.response_task.cancel()
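emit drains the queue without blocking: get_nowait raises asyncio.QueueEmpty when nothing is buffered, which the handler maps to returning None. The same pattern in isolation:

    import asyncio

    async def main():
        queue: asyncio.Queue = asyncio.Queue()
        await queue.put("audio-chunk")
        try:
            item = queue.get_nowait()   # returns immediately, never awaits
        except asyncio.QueueEmpty:
            item = None                 # nothing buffered yet -> emit nothing
        print(item)

    asyncio.run(main())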
@@ -232,7 +231,7 @@ class RealTimeScreenAssistant(AsyncAudioVideoStreamHandler):
                 await self.response_task
             except asyncio.CancelledError:
                 pass
-
+
         if self.session:
             try:
                 # Properly close the session using context manager
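The shutdown sequence cancels the background task and then awaits it so the resulting CancelledError is consumed rather than left to warn about an un-retrieved task. A minimal runnable version of that pattern:

    import asyncio

    async def worker():
        await asyncio.sleep(3600)       # stand-in for the response loop

    async def main():
        task = asyncio.create_task(worker())
        await asyncio.sleep(0)          # let the task start
        if not task.done():
            task.cancel()
        try:
            await task                  # CancelledError surfaces here
        except asyncio.CancelledError:
            pass                        # expected during shutdown

    asyncio.run(main())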
@@ -243,7 +242,7 @@ class RealTimeScreenAssistant(AsyncAudioVideoStreamHandler):
                 print("🔴 Disconnected from GenAI Live API")
             except Exception as e:
                 print(f"❌ Error during shutdown: {e}")
-
+
         self.session = None
         if hasattr(self, 'session_context'):
             self.session_context = None
@@ -257,19 +256,19 @@ def initialize_real_time_assistant():
         # Create handler
         handler = RealTimeScreenAssistant()
         app_state["handler"] = handler
-
+
         # Create stream with Cloudflare TURN (Task 22-23)
         stream = Stream(
             handler=ReplyOnPause(handler),  # Voice activity detection (Task 3)
-            modality="audio-video",
+            modality="audio-video",
             mode="send-receive",
             rtc_configuration=get_cloudflare_turn_credentials_async,
             time_limit=300  # 5 minute limit for Spaces
         )
-
+
         app_state["stream"] = stream
         return stream
-
+
     except Exception as e:
         print(f"❌ Error creating stream: {e}")
         return None
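Here rtc_configuration is handed a Cloudflare credentials helper, but the underlying shape is a standard WebRTC RTCConfiguration mapping. A sketch of what such a dict looks like; the servers and credentials below are placeholders, not real values:

    # Placeholder servers and credentials; in this app the real values
    # come from the Cloudflare helper at connection time.
    rtc_configuration = {
        "iceServers": [
            {"urls": "stun:stun.example.com:3478"},
            {
                "urls": "turn:turn.example.com:3478",
                "username": "user",
                "credential": "secret",
            },
        ]
    }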
@@ -280,10 +279,10 @@ def handle_connect():
     current_api_key = os.getenv("GEMINI_API_KEY", "")
     if not current_api_key:
         return "❌ Please set GEMINI_API_KEY environment variable"
-
+
     if app_state["connected"]:
         return "✅ Already connected - session is active"
-
+
     app_state["connected"] = True
     return "✅ Connecting... Please allow microphone and camera permissions"
 
@@ -293,7 +292,7 @@ def handle_disconnect():
         asyncio.create_task(app_state["handler"].shutdown())
         app_state["connected"] = False
         return "🔴 Disconnected from AI assistant"
-
+
     return "Already disconnected"
 
 # Screen sharing JavaScript - Fixed syntax for HF Spaces
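One caveat on the hunk above: asyncio.create_task only works while an event loop is running, so calling handle_disconnect from a plain thread would raise RuntimeError. A hedged sketch of one defensive way to schedule the coroutine either way; this is illustrative, not what the app currently does:

    import asyncio

    async def shutdown():
        print("cleaned up")

    def schedule(coro):
        try:
            loop = asyncio.get_running_loop()  # inside a loop: schedule it
            loop.create_task(coro)
        except RuntimeError:
            asyncio.run(coro)                  # no running loop: run to completion

    schedule(shutdown())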
@@ -303,7 +302,7 @@ screen_share_js = '''
        if (!navigator.mediaDevices || !navigator.mediaDevices.getDisplayMedia) {
            return "❌ Screen sharing not supported in this browser";
        }
-
+
        const stream = await navigator.mediaDevices.getDisplayMedia({
            video: {
                width: { ideal: 1920 },
@@ -311,36 +310,36 @@ screen_share_js = '''
            },
            audio: false
        });
-
+
        // Find the video element from FastRTC
        const videos = document.querySelectorAll('video');
        let targetVideo = null;
-
+
        for (let video of videos) {
            if (video.srcObject && video.srcObject.getVideoTracks().length > 0) {
                targetVideo = video;
                break;
            }
        }
-
+
        if (targetVideo && targetVideo.srcObject) {
            // Replace the camera track with screen track
            const screenTrack = stream.getVideoTracks()[0];
            const sender = targetVideo.srcObject.getVideoTracks()[0];
-
+
            // Remove old track and add screen track
            targetVideo.srcObject.removeTrack(sender);
            targetVideo.srcObject.addTrack(screenTrack);
-
+
            screenTrack.onended = () => {
                console.log("Screen sharing ended");
            };
-
+
            return "🖥️ Screen sharing started successfully!";
        } else {
            return "❌ Could not find video stream to replace";
        }
-
+
    } catch (error) {
        console.error("Screen sharing error:", error);
        if (error.name === "NotAllowedError") {
@@ -355,46 +354,45 @@ screen_share_js = '''
 
 def create_interface():
     """Create main interface (Task 26-30)"""
-
     # Initialize stream
     stream = initialize_real_time_assistant()
-
+
     with gr.Blocks(
-        title="Real-Time Screen Assistant",
+        title="Real-Time Screen Assistant",
         theme=gr.themes.Soft()
     ) as demo:
-
+
         gr.Markdown("# 🖥️ Real-Time Screen Assistant")
         gr.Markdown("""
         **🎯 LIVE AI that sees your screen and provides real-time guidance!**
-
+
         **How it works:**
         1. **Connect** - Links to Google's GenAI Live API for real-time AI processing
-        2. **Share Screen** - AI can see exactly what you're doing on your screen
+        2. **Share Screen** - AI can see exactly what you're doing on your screen
         3. **Voice Chat** - Talk naturally, AI responds with voice and sees everything
         4. **Get Help** - Real-time assistance with software, coding, troubleshooting
-
+
         **Tech Stack:**
         - 🧠 Google GenAI Live API (multimodal real-time AI)
         - 📹 FastRTC (low-latency screen/audio streaming)
-        - 🎙️ Voice activity detection
+        - 🎙️ Voice activity detection
         - 🌐 Cloudflare TURN servers (HF Spaces optimized)
         """)
-
+
         # Status display
         status_display = gr.Textbox(
            label="Status",
            value="Ready to connect - Click Connect to start real-time session",
            interactive=False
        )
-
+
        # Control buttons (Task 3, 16-17)
        with gr.Row():
            connect_btn = gr.Button("🔌 Connect", variant="primary")
            mic_btn = gr.Button("🎙️ Test Microphone", variant="secondary")
-            screen_btn = gr.Button("🖥️ Show Your Screen", variant="secondary")
+            screen_btn = gr.Button("🖥️ Show Your Screen", variant="secondary")
            disconnect_btn = gr.Button("🔴 Disconnect", variant="stop")
-
+
        # Stream interface - FastRTC UI for microphone and video
        gr.Markdown("### 📡 Live Audio/Video Stream")
        if stream:
@@ -407,7 +405,7 @@ def create_interface():
            """)
        else:
            gr.HTML("<div>⚠️ Stream initialization failed - Check console for errors</div>")
-
+
        # Microphone activation JavaScript
        microphone_js = '''
        (async function() {
@@ -433,19 +431,19 @@ def create_interface():
            }
        })()
        '''
-
+
        # Instructions (Task 1-3)
        with gr.Accordion("📋 Instructions", open=True):
            gr.Markdown("""
            **How to use the real-time assistant:**
-
+
            1. **Connect**: Click Connect to start the AI session
            2. **Permissions**: Allow microphone and camera access
            3. **Show Screen**: Click "Show Your Screen" to share your screen
            4. **Voice Interaction**: Simply speak - the AI will respond
            5. **Real-time Guidance**: AI sees your screen and provides live help
            6. **Disconnect**: Click Disconnect when finished
-
+
            **Features implemented from refactoring instructions:**
            - ✅ FastRTC WebRTC streaming (Task 2)
            - ✅ Google GenAI Live API integration (Task 7-15)
@@ -455,7 +453,7 @@ def create_interface():
            - ✅ Real-time advice generation (Task 18-21)
            - ✅ Cloudflare TURN for HF Spaces (Task 22-23)
            """)
-
+
        # Privacy notice (Task 24-25)
        with gr.Accordion("🔒 Privacy & Security", open=False):
            gr.Markdown("""
@@ -464,37 +462,37 @@ def create_interface():
            - Data is transmitted securely via encrypted WebRTC connections
            - No permanent storage - all processing is real-time
            - You control what is shared and can disconnect anytime
-
+
            **Technical Details:**
            - Uses Google Gemini Live API for real-time multimodal processing
            - FastRTC provides low-latency WebRTC streaming
            - Cloudflare TURN servers ensure reliable connectivity on HF Spaces
            - Voice activity detection prevents interruptions
            """)
-
+
        # Wire up controls
        connect_btn.click(
            fn=handle_connect,
            outputs=[status_display]
        )
-
+
        mic_btn.click(
            fn=lambda: "🎙️ Testing microphone...",
            outputs=[status_display],
            js=microphone_js
        )
-
+
        screen_btn.click(
            fn=lambda: "🖥️ Requesting screen share...",
            outputs=[status_display],
            js=screen_share_js
        )
-
+
        disconnect_btn.click(
            fn=handle_disconnect,
            outputs=[status_display]
        )
-
+
        return demo
 
 # Main execution
@@ -503,20 +501,20 @@ if __name__ == "__main__":
     print("=" * 50)
     print("Refactored according to instructions for:")
     print("- Google GenAI Live API integration")
-    print("- FastRTC real-time streaming")
+    print("- FastRTC real-time streaming")
     print("- Voice activity detection")
     print("- Screen sharing capabilities")
     print("- Cloudflare TURN for HF Spaces")
-
+
     if not API_KEY:
         print("\n⚠️ No GEMINI_API_KEY environment variable found")
         print("Please set your Google AI API key:")
         print("export GEMINI_API_KEY='your-api-key-here'")
     else:
         print(f"\n✅ API key configured (length: {len(API_KEY)})")
-
+
     print("\n🚀 Starting real-time assistant...")
-
+
     try:
         demo = create_interface()
         demo.launch(
@@ -527,4 +525,4 @@ if __name__ == "__main__":
         )
     except Exception as e:
         print(f"❌ Failed to launch: {e}")
-        print("Ensure all dependencies are installed: pip install -r requirements.txt")
+        print("Ensure all dependencies are installed: pip install -r requirements.txt")