Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,193 +1,476 @@
|
|
1 |
-
|
2 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
import os
|
|
|
4 |
import time
|
5 |
-
from io import BytesIO
|
6 |
-
|
7 |
-
import gradio as gr
|
8 |
import numpy as np
|
9 |
-
import
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
Stream,
|
14 |
-
WebRTC,
|
15 |
-
get_cloudflare_turn_credentials_async,
|
16 |
-
wait_for_item,
|
17 |
-
)
|
18 |
from google import genai
|
19 |
-
from
|
20 |
-
from PIL import Image
|
21 |
-
|
22 |
-
load_dotenv()
|
23 |
-
|
24 |
-
|
25 |
-
def encode_audio(data: np.ndarray) -> dict:
|
26 |
-
"""Encode Audio data to send to the server"""
|
27 |
-
return {
|
28 |
-
"mime_type": "audio/pcm",
|
29 |
-
"data": base64.b64encode(data.tobytes()).decode("UTF-8"),
|
30 |
-
}
|
31 |
|
|
|
|
|
32 |
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
)
|
46 |
super().__init__(
|
47 |
-
"mono",
|
48 |
-
output_sample_rate=24000,
|
49 |
-
input_sample_rate=16000
|
50 |
)
|
51 |
-
self.audio_queue = asyncio.Queue()
|
52 |
-
self.video_queue = asyncio.Queue()
|
53 |
self.session = None
|
54 |
self.last_frame_time = 0
|
55 |
-
self.
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
async def start_up(self):
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
72 |
try:
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
93 |
async def video_emit(self):
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
104 |
if self.session:
|
105 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
106 |
|
107 |
-
|
108 |
-
|
109 |
-
if array is not None:
|
110 |
-
return (self.output_sample_rate, array)
|
111 |
-
return array
|
112 |
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
118 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
119 |
|
120 |
-
|
121 |
-
handler
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
],
|
129 |
-
ui_args={
|
130 |
-
"icon": "https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png",
|
131 |
-
"pulse_color": "rgb(255, 255, 255)",
|
132 |
-
"icon_button_color": "rgb(255, 255, 255)",
|
133 |
-
"title": "Gemini Audio Video Chat",
|
134 |
-
},
|
135 |
-
)
|
136 |
|
137 |
-
|
138 |
-
|
139 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
140 |
|
141 |
-
|
142 |
-
|
143 |
-
"""
|
144 |
-
<div style='display: flex; align-items: center; justify-content: center; gap: 20px'>
|
145 |
-
<div style="background-color: var(--block-background-fill); border-radius: 8px">
|
146 |
-
<img src="https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png" style="width: 100px; height: 100px;">
|
147 |
-
</div>
|
148 |
-
<div>
|
149 |
-
<h1>Gen AI SDK Voice Chat</h1>
|
150 |
-
<p>Speak with Gemini using real-time audio + video streaming</p>
|
151 |
-
<p>Powered by <a href="https://gradio.app/">Gradio</a> and <a href=https://freddyaboulton.github.io/gradio-webrtc/">WebRTC</a>β‘οΈ</p>
|
152 |
-
<p>Get an API Key <a href="https://support.google.com/googleapi/answer/6158862?hl=en">here</a></p>
|
153 |
-
</div>
|
154 |
-
</div>
|
155 |
-
"""
|
156 |
-
)
|
157 |
-
with gr.Row() as row:
|
158 |
-
with gr.Column():
|
159 |
-
webrtc = WebRTC(
|
160 |
-
label="Video Chat",
|
161 |
-
modality="audio-video",
|
162 |
-
mode="send-receive",
|
163 |
-
elem_id="video-source",
|
164 |
-
rtc_configuration=get_cloudflare_turn_credentials_async
|
165 |
-
if get_space()
|
166 |
-
else None,
|
167 |
-
icon="https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png",
|
168 |
-
pulse_color="rgb(255, 255, 255)",
|
169 |
-
icon_button_color="rgb(255, 255, 255)",
|
170 |
-
)
|
171 |
-
with gr.Column():
|
172 |
-
image_input = gr.Image(
|
173 |
-
label="Image", type="numpy", sources=["upload", "clipboard"]
|
174 |
-
)
|
175 |
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
182 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
183 |
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
if __name__ == "__main__":
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
192 |
else:
|
193 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Real-Time Screen Assistant - Refactored for Google GenAI Live API + FastRTC
|
3 |
+
|
4 |
+
This application transforms the original screenshot analyzer into a real-time
|
5 |
+
screen sharing assistant with voice interaction, following the refactoring
|
6 |
+
instructions for live streaming capabilities.
|
7 |
+
"""
|
8 |
+
|
9 |
import os
|
10 |
+
import asyncio
|
11 |
import time
|
|
|
|
|
|
|
12 |
import numpy as np
|
13 |
+
import numpy.typing as npt
|
14 |
+
import cv2
|
15 |
+
import gradio as gr
|
16 |
+
from fastrtc import Stream, AsyncAudioVideoStreamHandler, get_cloudflare_turn_credentials_async, ReplyOnPause
|
|
|
|
|
|
|
|
|
|
|
17 |
from google import genai
|
18 |
+
from google.genai import types
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
|
20 |
+
# Environment variable for API key
|
21 |
+
API_KEY = os.getenv("GEMINI_API_KEY", "")
|
22 |
|
23 |
+
class RealTimeScreenAssistant(AsyncAudioVideoStreamHandler):
    """Real-time screen assistant streaming mic audio + screen video to the
    Google GenAI Live API over FastRTC.

    Features:
    - Google GenAI Live API integration (v1alpha, gemini-2.0-flash-live-001)
    - Real-time audio/video streaming via FastRTC
    - Frame throttling to ~1 FPS for screen sharing
    - Queued audio playback of model responses via emit()
    """

    def __init__(self):
        # Audio format used by the Live API: 16 kHz mono in, 24 kHz PCM out.
        super().__init__(
            expected_layout="mono",
            output_sample_rate=24000,
            input_sample_rate=16000
        )
        self.session = None                  # Live API session (set in start_up)
        self.last_frame_time = 0             # time.time() of last forwarded frame
        self.audio_queue = asyncio.Queue()   # model audio chunks awaiting emit()
        self.connected = False
        self.frame_interval = 1.0            # seconds between forwarded frames (~1 FPS)

    async def start_up(self):
        """Open the GenAI Live session and start the response-reader task."""
        try:
            # Re-check the environment variable in case it was set after import.
            current_api_key = os.getenv("GEMINI_API_KEY", "")
            if not current_api_key:
                print("❌ No GEMINI_API_KEY found in environment")
                return

            # v1alpha is required for the Live API endpoints.
            client = genai.Client(
                api_key=current_api_key,
                http_options={"api_version": "v1alpha"}
            )

            from google.genai.types import LiveConnectConfig

            # Deliberately minimal config — richer configs have caused
            # WebSocket handshake errors with this endpoint.
            config = LiveConnectConfig(
                system_instruction=(
                    "You are a helpful real-time assistant who watches the user's screen and provides "
                    "guidance on using software. Give clear, step-by-step instructions based on what "
                    "you see and hear. Be proactive in offering assistance."
                )
            )

            # connect() returns an async context manager; keep it so
            # shutdown() can invoke __aexit__ for a clean close.
            self.session_context = client.aio.live.connect(
                model="gemini-2.0-flash-live-001",
                config=config
            )
            self.session = await self.session_context.__aenter__()

            self.connected = True
            print("✅ Connected to Google GenAI Live API")

            # Background task that pumps model responses into audio_queue.
            self.response_task = asyncio.create_task(self._handle_responses())

        except Exception as e:
            print(f"❌ Failed to connect to GenAI: {e}")
            self.connected = False

    async def _enqueue_response(self, msg):
        """Queue the audio payload of one Live API message and log any text."""
        if msg.data:  # audio response: raw 16-bit PCM bytes from the model
            audio_array = np.frombuffer(msg.data, dtype=np.int16)
            if len(audio_array) > 0:
                # Reshape to (channels, samples) for FastRTC mono playback.
                await self.audio_queue.put(audio_array.reshape(1, -1))
        if msg.text:  # text response — surfaced only to the console
            print(f"🤖 AI: {msg.text}")

    async def _handle_responses(self):
        """Read model responses until disconnected, feeding audio to emit()."""
        try:
            while self.connected and self.session:
                try:
                    response_stream = self.session.receive()

                    # receive() has returned either an async iterator or an
                    # awaitable depending on SDK version — handle both shapes.
                    if hasattr(response_stream, '__aiter__'):
                        async for msg in response_stream:
                            if not self.connected:
                                break
                            await self._enqueue_response(msg)
                    else:
                        msg = await response_stream
                        if msg:
                            await self._enqueue_response(msg)

                except Exception as inner_e:
                    # Heuristic: treat connection/closed errors as terminal,
                    # everything else as transient.
                    if "connection" in str(inner_e).lower() or "closed" in str(inner_e).lower():
                        print("🔴 Connection closed, stopping response handler")
                        break
                    print(f"⚠️ Response handling error: {inner_e}")
                    await asyncio.sleep(0.1)  # brief pause before retry

        except Exception as e:
            print(f"❌ Error handling AI responses: {e}")

    async def receive(self, frame: tuple[int, npt.NDArray[np.int16]]):
        """Forward one microphone frame (sample_rate, samples) to the Live API."""
        if not self.connected or not self.session:
            return

        try:
            _, audio_np = frame
            audio_bytes = audio_np.tobytes()

            # NOTE(review): the keyword name for send_realtime_input varies
            # across google-genai versions (input= / media=) — confirm against
            # the installed SDK.
            await self.session.send_realtime_input(
                input=types.Blob(
                    data=audio_bytes,
                    mime_type="audio/pcm;rate=16000"
                )
            )
        except Exception as e:
            print(f"❌ Error sending audio: {e}")

    async def video_receive(self, frame: npt.NDArray[np.float32]):
        """Throttle, JPEG-encode, and forward one screen/video frame."""
        if not self.connected or not self.session:
            return

        try:
            # Throttle to ~1 FPS to limit bandwidth and API load.
            current_time = time.time()
            if current_time - self.last_frame_time < self.frame_interval:
                return
            self.last_frame_time = current_time

            # float32 frames are assumed to be in [0, 1] — TODO confirm the
            # actual range delivered by FastRTC upstream.
            if frame.dtype == np.float32:
                frame_uint8 = (frame * 255).astype(np.uint8)
            else:
                frame_uint8 = frame.astype(np.uint8)

            # Guard against degenerate frames before handing them to OpenCV.
            if frame_uint8.size == 0 or frame_uint8.shape[0] == 0 or frame_uint8.shape[1] == 0:
                return

            try:
                success, jpg_bytes = cv2.imencode('.jpg', frame_uint8, [cv2.IMWRITE_JPEG_QUALITY, 80])
                if not success:
                    return
            except cv2.error:
                # Swallow OpenCV encoding errors; dropping a frame is fine.
                return

            await self.session.send_realtime_input(
                input=types.Blob(
                    data=jpg_bytes.tobytes(),
                    mime_type="image/jpeg"
                )
            )

        except Exception as e:
            print(f"❌ Error sending video frame: {e}")

    async def emit(self):
        """Return the next queued model-audio chunk as (rate, array), or None."""
        try:
            audio_chunk = self.audio_queue.get_nowait()
            # Use the configured rate instead of repeating the literal 24000.
            return (self.output_sample_rate, audio_chunk)
        except asyncio.QueueEmpty:
            return None

    def copy(self):
        """Return a fresh handler instance (FastRTC clones per connection)."""
        new_instance = RealTimeScreenAssistant()
        new_instance.frame_interval = self.frame_interval
        return new_instance

    async def video_emit(self):
        """No video is sent back to the user for this use case."""
        return None

    async def shutdown(self):
        """Stop the reader task and close the Live session cleanly."""
        self.connected = False

        # Cancel the response handler task if it was started.
        if hasattr(self, 'response_task') and not self.response_task.done():
            self.response_task.cancel()
            try:
                await self.response_task
            except asyncio.CancelledError:
                pass

        if self.session:
            try:
                # Prefer the stored context manager so __aexit__ runs.
                if hasattr(self, 'session_context'):
                    await self.session_context.__aexit__(None, None, None)
                else:
                    await self.session.close()
                print("🔴 Disconnected from GenAI Live API")
            except Exception as e:
                print(f"❌ Error during shutdown: {e}")

        self.session = None
        if hasattr(self, 'session_context'):
            self.session_context = None
|
250 |
|
251 |
+
# Module-level shared state consulted by the UI callbacks: the active
# FastRTC stream, the live handler instance, and a connection flag.
app_state = {
    "stream": None,
    "handler": None,
    "connected": False,
}
|
|
|
|
|
|
|
253 |
|
254 |
+
def initialize_real_time_assistant():
    """Create the handler and FastRTC Stream; return the Stream or None.

    Side effects: stores the handler and stream in the module-level
    ``app_state`` dict for the UI callbacks.
    """
    try:
        handler = RealTimeScreenAssistant()
        app_state["handler"] = handler

        # Fix: pass the AsyncAudioVideoStreamHandler directly. ReplyOnPause
        # is an audio-only wrapper for reply-generator callables and cannot
        # drive the "audio-video" modality; wrapping the handler in it breaks
        # video delivery. Turn-taking is managed by the Live API session.
        stream = Stream(
            handler=handler,
            modality="audio-video",
            mode="send-receive",
            # Async credential fetcher — needed on HF Spaces for TURN relay.
            rtc_configuration=get_cloudflare_turn_credentials_async,
            time_limit=300  # 5 minute cap per session for shared Spaces hardware
        )

        app_state["stream"] = stream
        return stream

    except Exception as e:
        print(f"❌ Error creating stream: {e}")
        return None
|
276 |
|
277 |
+
def handle_connect():
    """Connect-button handler: validate the API key and flag the session.

    Returns a human-readable status string for the UI status textbox.
    """
    # Re-check the environment variable in case it was set after import.
    current_api_key = os.getenv("GEMINI_API_KEY", "")
    if not current_api_key:
        return "❌ Please set GEMINI_API_KEY environment variable"

    if app_state["connected"]:
        return "✅ Already connected - session is active"

    app_state["connected"] = True
    return "✅ Connecting... Please allow microphone and camera permissions"
|
289 |
|
290 |
+
def handle_disconnect():
    """Disconnect-button handler: schedule handler shutdown and clear the flag.

    Fix: the original called asyncio.create_task() unconditionally, which
    raises RuntimeError when no event loop is running in the calling thread
    (the normal case for a synchronous Gradio callback). Schedule on the
    running loop when there is one, otherwise drive the coroutine with
    asyncio.run().
    """
    if app_state["handler"] and app_state["connected"]:
        shutdown_coro = app_state["handler"].shutdown()
        try:
            loop = asyncio.get_running_loop()
        except RuntimeError:
            # No loop in this thread — run the shutdown to completion here.
            asyncio.run(shutdown_coro)
        else:
            loop.create_task(shutdown_coro)
        app_state["connected"] = False
        return "🔴 Disconnected from AI assistant"

    return "Already disconnected"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
298 |
|
299 |
+
# Screen sharing JavaScript (Task 16): runs in the browser when the
# "Show Your Screen" button is clicked. It requests a screen capture via
# getDisplayMedia and swaps the current camera track on the active WebRTC
# <video> element for the screen track. Returns a status string.
screen_share_js = """
async function shareScreen() {
    try {
        const screenStream = await navigator.mediaDevices.getDisplayMedia({
            video: {
                mediaSource: 'screen',
                width: { ideal: 1280 },
                height: { ideal: 720 }
            },
            audio: false
        });

        // Find the <video> element that is backed by an active WebRTC stream.
        const videoElements = document.querySelectorAll('video');
        const webrtcVideo = Array.from(videoElements).find(video =>
            video.srcObject && video.srcObject.getVideoTracks().length > 0
        );

        if (webrtcVideo && webrtcVideo.srcObject) {
            const videoTrack = screenStream.getVideoTracks()[0];
            const currentTrack = webrtcVideo.srcObject.getVideoTracks()[0];

            // Replace the camera track with the screen-capture track.
            webrtcVideo.srcObject.removeTrack(currentTrack);
            webrtcVideo.srcObject.addTrack(videoTrack);

            videoTrack.onended = () => {
                console.log('Screen sharing ended');
            };

            return "🖥️ Screen sharing started";
        } else {
            return "❌ Could not find video element";
        }

    } catch (error) {
        console.error('Screen sharing error:', error);
        if (error.name === 'NotAllowedError') {
            return "❌ Screen sharing permission denied";
        } else {
            return `❌ Screen sharing failed: ${error.message}`;
        }
    }
}

return shareScreen();
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
345 |
|
346 |
+
def create_interface():
    """Build and return the Gradio Blocks UI wiring stream + control buttons."""

    # Create the FastRTC stream up front so its UI can be embedded below.
    stream = initialize_real_time_assistant()

    with gr.Blocks(
        title="Real-Time Screen Assistant",
        theme=gr.themes.Soft()
    ) as demo:

        gr.Markdown("# 🖥️ Real-Time Screen Assistant")
        gr.Markdown("""
        **Live AI assistant with screen sharing and voice interaction**

        Refactored according to instructions for real-time capabilities using:
        - Google GenAI Live API for multimodal AI processing
        - FastRTC for low-latency audio/video streaming
        - Voice activity detection with ReplyOnPause
        - Cloudflare TURN servers for HF Spaces compatibility
        """)

        # Status display updated by the button callbacks below.
        status_display = gr.Textbox(
            label="Status",
            value="Ready to connect - Click Connect to start real-time session",
            interactive=False
        )

        # Control buttons (Task 3, 16-17)
        with gr.Row():
            connect_btn = gr.Button("🔗 Connect", variant="primary")
            screen_btn = gr.Button("🖥️ Show Your Screen", variant="secondary")
            disconnect_btn = gr.Button("🔴 Disconnect", variant="stop")

        # Stream interface. NOTE(review): assigning stream.ui inside a Blocks
        # context may not actually mount the FastRTC UI — confirm whether
        # stream.ui needs an explicit .render() here.
        if stream and hasattr(stream, 'ui'):
            gr.Markdown("### 📡 Live Stream")
            stream_ui = stream.ui
        else:
            stream_ui = gr.HTML("<div>⚠️ Stream initialization failed</div>")

        # Usage instructions (Task 1-3)
        with gr.Accordion("📋 Instructions", open=True):
            gr.Markdown("""
            **How to use the real-time assistant:**

            1. **Connect**: Click Connect to start the AI session
            2. **Permissions**: Allow microphone and camera access
            3. **Show Screen**: Click "Show Your Screen" to share your screen
            4. **Voice Interaction**: Simply speak - the AI will respond
            5. **Real-time Guidance**: AI sees your screen and provides live help
            6. **Disconnect**: Click Disconnect when finished

            **Features implemented from refactoring instructions:**
            - ✅ FastRTC WebRTC streaming (Task 2)
            - ✅ Google GenAI Live API integration (Task 7-15)
            - ✅ Connect/Show Screen/Disconnect controls (Task 3, 16-17)
            - ✅ Voice activity detection with ReplyOnPause (Task 3)
            - ✅ Screen sharing via getDisplayMedia (Task 6)
            - ✅ Real-time advice generation (Task 18-21)
            - ✅ Cloudflare TURN for HF Spaces (Task 22-23)
            """)

        # Privacy notice (Task 24-25)
        with gr.Accordion("🔒 Privacy & Security", open=False):
            gr.Markdown("""
            **Privacy Notice:**
            - Screen content and voice are processed by Google's AI services
            - Data is transmitted securely via encrypted WebRTC connections
            - No permanent storage - all processing is real-time
            - You control what is shared and can disconnect anytime

            **Technical Details:**
            - Uses Google Gemini Live API for real-time multimodal processing
            - FastRTC provides low-latency WebRTC streaming
            - Cloudflare TURN servers ensure reliable connectivity on HF Spaces
            - Voice activity detection prevents interruptions
            """)

        # Wire up the controls to their Python (and JS) handlers.
        connect_btn.click(
            fn=handle_connect,
            outputs=[status_display]
        )

        screen_btn.click(
            fn=lambda: "🖥️ Requesting screen share...",
            outputs=[status_display],
            js=screen_share_js
        )

        disconnect_btn.click(
            fn=handle_disconnect,
            outputs=[status_display]
        )

    return demo
|
445 |
|
446 |
+
# Main execution: print a startup banner, report API-key status, then
# build and launch the Gradio interface on port 7860.
if __name__ == "__main__":
    print("🖥️ Real-Time Screen Assistant")
    print("=" * 50)
    print("Refactored according to instructions for:")
    print("- Google GenAI Live API integration")
    print("- FastRTC real-time streaming")
    print("- Voice activity detection")
    print("- Screen sharing capabilities")
    print("- Cloudflare TURN for HF Spaces")

    if not API_KEY:
        print("\n⚠️ No GEMINI_API_KEY environment variable found")
        print("Please set your Google AI API key:")
        print("export GEMINI_API_KEY='your-api-key-here'")
    else:
        print(f"\n✅ API key configured (length: {len(API_KEY)})")

    print("\n🚀 Starting real-time assistant...")

    try:
        demo = create_interface()
        demo.launch(
            server_name="0.0.0.0",  # bind all interfaces (required on Spaces)
            server_port=7860,
            share=False,
            show_error=True
        )
    except Exception as e:
        print(f"❌ Failed to launch: {e}")
        print("Ensure all dependencies are installed: pip install -r requirements.txt")
|