seawolf2357 committed on
Commit
7880cc5
·
verified ·
1 Parent(s): bac099b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +352 -50
app.py CHANGED
@@ -25,6 +25,39 @@ load_dotenv()
25
 
26
  SAMPLE_RATE = 24000
27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  # HTML content embedded as a string
29
  HTML_CONTENT = """<!DOCTYPE html>
30
  <html lang="ko">
@@ -87,14 +120,31 @@ HTML_CONTENT = """<!DOCTYPE html>
87
  font-size: 32px;
88
  letter-spacing: 1px;
89
  }
90
- /* Web search toggle */
91
- .search-toggle {
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  display: flex;
93
  align-items: center;
94
- justify-content: center;
95
  gap: 10px;
96
- margin-top: 15px;
97
  }
 
 
 
 
 
 
98
  .toggle-switch {
99
  position: relative;
100
  width: 50px;
@@ -120,9 +170,43 @@ HTML_CONTENT = """<!DOCTYPE html>
120
  .toggle-switch.active .toggle-slider {
121
  transform: translateX(24px);
122
  }
123
- .search-label {
 
 
 
 
 
 
124
  font-size: 14px;
125
- color: #aaa;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  }
127
  .chat-container {
128
  border-radius: 12px;
@@ -192,6 +276,7 @@ HTML_CONTENT = """<!DOCTYPE html>
192
  margin-top: 20px;
193
  display: flex;
194
  justify-content: center;
 
195
  flex-shrink: 0;
196
  }
197
  button {
@@ -220,6 +305,14 @@ HTML_CONTENT = """<!DOCTYPE html>
220
  button:active {
221
  transform: translateY(1px);
222
  }
 
 
 
 
 
 
 
 
223
  #audio-output {
224
  display: none;
225
  }
@@ -344,6 +437,11 @@ HTML_CONTENT = """<!DOCTYPE html>
344
  background-color: var(--secondary-color);
345
  border-radius: 50%;
346
  }
 
 
 
 
 
347
  </style>
348
  </head>
349
 
@@ -359,22 +457,71 @@ HTML_CONTENT = """<!DOCTYPE html>
359
  </div>
360
  <h1>MOUSE 음성 챗</h1>
361
  </div>
362
- <div class="search-toggle">
363
- <span class="search-label">웹 검색</span>
364
- <div id="search-toggle" class="toggle-switch">
365
- <div class="toggle-slider"></div>
366
- </div>
367
- </div>
368
  <div class="status-indicator">
369
  <div id="status-dot" class="status-dot disconnected"></div>
370
  <span id="status-text">연결 대기 중</span>
371
  </div>
372
  </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
373
  <div class="chat-container">
374
  <div class="chat-messages" id="chat-messages"></div>
 
 
 
375
  </div>
376
  <div class="controls">
377
  <button id="start-button">대화 시작</button>
 
378
  </div>
379
  </div>
380
  <audio id="audio-output"></audio>
@@ -383,15 +530,22 @@ HTML_CONTENT = """<!DOCTYPE html>
383
  let peerConnection;
384
  let webrtc_id;
385
  let webSearchEnabled = false;
 
 
386
  const audioOutput = document.getElementById('audio-output');
387
  const startButton = document.getElementById('start-button');
 
388
  const chatMessages = document.getElementById('chat-messages');
389
  const statusDot = document.getElementById('status-dot');
390
  const statusText = document.getElementById('status-text');
391
  const searchToggle = document.getElementById('search-toggle');
 
 
 
392
  let audioLevel = 0;
393
  let animationFrame;
394
  let audioContext, analyser, audioSource;
 
395
 
396
  // Web search toggle functionality
397
  searchToggle.addEventListener('click', () => {
@@ -400,14 +554,50 @@ HTML_CONTENT = """<!DOCTYPE html>
400
  console.log('Web search enabled:', webSearchEnabled);
401
  });
402
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
403
  function updateStatus(state) {
404
  statusDot.className = 'status-dot ' + state;
405
  if (state === 'connected') {
406
  statusText.textContent = '연결됨';
 
407
  } else if (state === 'connecting') {
408
  statusText.textContent = '연결 중...';
 
409
  } else {
410
  statusText.textContent = '연결 대기 중';
 
411
  }
412
  }
413
  function updateButtonState() {
@@ -508,13 +698,19 @@ HTML_CONTENT = """<!DOCTYPE html>
508
  audioOutput.play();
509
  }
510
  });
511
- const dataChannel = peerConnection.createDataChannel('text');
 
 
 
 
 
512
  dataChannel.onmessage = (event) => {
513
  const eventJson = JSON.parse(event.data);
514
  if (eventJson.type === "error") {
515
  showError(eventJson.message);
516
  }
517
  };
 
518
  const offer = await peerConnection.createOffer();
519
  await peerConnection.setLocalDescription(offer);
520
  await new Promise((resolve) => {
@@ -547,7 +743,9 @@ HTML_CONTENT = """<!DOCTYPE html>
547
  sdp: peerConnection.localDescription.sdp,
548
  type: peerConnection.localDescription.type,
549
  webrtc_id: webrtc_id,
550
- web_search_enabled: webSearchEnabled
 
 
551
  })
552
  });
553
  const serverResponse = await response.json();
@@ -562,7 +760,11 @@ HTML_CONTENT = """<!DOCTYPE html>
562
  const eventSource = new EventSource('/outputs?webrtc_id=' + webrtc_id);
563
  eventSource.addEventListener("output", (event) => {
564
  const eventJson = JSON.parse(event.data);
565
- addMessage("assistant", eventJson.content);
 
 
 
 
566
  });
567
  eventSource.addEventListener("search", (event) => {
568
  const eventJson = JSON.parse(event.data);
@@ -580,7 +782,11 @@ HTML_CONTENT = """<!DOCTYPE html>
580
  function addMessage(role, content) {
581
  const messageDiv = document.createElement('div');
582
  messageDiv.classList.add('message', role);
583
- messageDiv.textContent = content;
 
 
 
 
584
  chatMessages.appendChild(messageDiv);
585
  chatMessages.scrollTop = chatMessages.scrollHeight;
586
  }
@@ -610,6 +816,7 @@ HTML_CONTENT = """<!DOCTYPE html>
610
  console.log('closing');
611
  peerConnection.close();
612
  }
 
613
  updateButtonState();
614
  audioLevel = 0;
615
  }
@@ -675,8 +882,8 @@ brave_api_key = os.getenv("BSEARCH_API")
675
  search_client = BraveSearchClient(brave_api_key) if brave_api_key else None
676
  print(f"Search client initialized: {search_client is not None}, API key present: {bool(brave_api_key)}")
677
 
678
- # Store web search settings by connection
679
- web_search_settings = {}
680
 
681
  def update_chatbot(chatbot: list[dict], response: ResponseAudioTranscriptDoneEvent):
682
  chatbot.append({"role": "assistant", "content": response.transcript})
@@ -684,7 +891,8 @@ def update_chatbot(chatbot: list[dict], response: ResponseAudioTranscriptDoneEve
684
 
685
 
686
  class OpenAIHandler(AsyncStreamHandler):
687
- def __init__(self, web_search_enabled: bool = False, webrtc_id: str = None) -> None:
 
688
  super().__init__(
689
  expected_layout="mono",
690
  output_sample_rate=SAMPLE_RATE,
@@ -699,21 +907,27 @@ class OpenAIHandler(AsyncStreamHandler):
699
  self.current_call_id = None
700
  self.webrtc_id = webrtc_id
701
  self.web_search_enabled = web_search_enabled
702
- print(f"Handler created with web_search_enabled={web_search_enabled}, webrtc_id={webrtc_id}")
 
 
 
703
 
704
  def copy(self):
705
  # Get the most recent settings
706
- if web_search_settings:
707
  # Get the most recent webrtc_id
708
- recent_ids = sorted(web_search_settings.keys(),
709
- key=lambda k: web_search_settings[k].get('timestamp', 0),
710
  reverse=True)
711
  if recent_ids:
712
  recent_id = recent_ids[0]
713
- settings = web_search_settings[recent_id]
714
- web_search_enabled = settings.get('enabled', False)
715
- print(f"Handler.copy() using recent settings - webrtc_id={recent_id}, web_search_enabled={web_search_enabled}")
716
- return OpenAIHandler(web_search_enabled=web_search_enabled, webrtc_id=recent_id)
 
 
 
717
 
718
  print(f"Handler.copy() called - creating new handler with default settings")
719
  return OpenAIHandler(web_search_enabled=False)
@@ -739,26 +953,57 @@ class OpenAIHandler(AsyncStreamHandler):
739
 
740
  return f"웹 검색 결과 '{query}':\n\n" + "\n".join(formatted_results)
741
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
742
  async def start_up(self):
743
  """Connect to realtime API with function calling enabled"""
744
  # First check if we have the most recent settings
745
- if web_search_settings:
746
- recent_ids = sorted(web_search_settings.keys(),
747
- key=lambda k: web_search_settings[k].get('timestamp', 0),
748
  reverse=True)
749
  if recent_ids:
750
  recent_id = recent_ids[0]
751
- settings = web_search_settings[recent_id]
752
- self.web_search_enabled = settings.get('enabled', False)
 
 
753
  self.webrtc_id = recent_id
754
- print(f"start_up: Updated settings from storage - webrtc_id={self.webrtc_id}, web_search_enabled={self.web_search_enabled}")
 
755
 
756
- print(f"Starting up handler with web_search_enabled={self.web_search_enabled}")
 
757
  self.client = openai.AsyncOpenAI()
758
 
759
  # Define the web search function
760
  tools = []
761
- instructions = "You are a helpful assistant. Respond in Korean when the user speaks Korean."
 
 
 
762
 
763
  if self.web_search_enabled and self.search_client:
764
  tools = [{
@@ -771,7 +1016,7 @@ class OpenAIHandler(AsyncStreamHandler):
771
  "properties": {
772
  "query": {
773
  "type": "string",
774
- "description": "The search query in Korean or English"
775
  }
776
  },
777
  "required": ["query"]
@@ -780,8 +1025,8 @@ class OpenAIHandler(AsyncStreamHandler):
780
  }]
781
  print("Web search function added to tools")
782
 
783
- instructions = (
784
- "You are a helpful assistant with web search capabilities. "
785
  "IMPORTANT: You MUST use the web_search function for ANY of these topics:\n"
786
  "- Weather (날씨, 기온, 비, 눈)\n"
787
  "- News (뉴스, 소식)\n"
@@ -791,8 +1036,11 @@ class OpenAIHandler(AsyncStreamHandler):
791
  "- Any question about 2024 or 2025\n"
792
  "- Any time-sensitive information\n\n"
793
  "When in doubt, USE web_search. It's better to search and provide accurate information "
794
- "than to guess or use outdated information. Always respond in Korean when the user speaks Korean."
795
  )
 
 
 
796
 
797
  async with self.client.beta.realtime.connect(
798
  model="gpt-4o-mini-realtime-preview-2024-12-17"
@@ -805,9 +1053,23 @@ class OpenAIHandler(AsyncStreamHandler):
805
  "tool_choice": "auto" if tools else "none"
806
  }
807
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
808
  await conn.session.update(session=session_update)
809
  self.connection = conn
810
- print(f"Connected with tools: {len(tools)} functions")
811
 
812
  async for event in self.connection:
813
  # Debug logging for function calls
@@ -815,7 +1077,11 @@ class OpenAIHandler(AsyncStreamHandler):
815
  print(f"Function event: {event.type}")
816
 
817
  if event.type == "response.audio_transcript.done":
818
- await self.output_queue.put(AdditionalOutputs(event))
 
 
 
 
819
 
820
  elif event.type == "response.audio.delta":
821
  await self.output_queue.put(
@@ -886,7 +1152,14 @@ class OpenAIHandler(AsyncStreamHandler):
886
  # Connection might be closed, ignore the error
887
 
888
  async def emit(self) -> tuple[int, np.ndarray] | AdditionalOutputs | None:
889
- return await wait_for_item(self.output_queue)
 
 
 
 
 
 
 
890
 
891
  async def shutdown(self) -> None:
892
  if self.connection:
@@ -921,18 +1194,23 @@ stream.mount(app)
921
  # Intercept offer to capture settings
922
  @app.post("/webrtc/offer", include_in_schema=False)
923
  async def custom_offer(request: Request):
924
- """Intercept offer to capture web search settings"""
925
  body = await request.json()
926
 
927
  webrtc_id = body.get("webrtc_id")
928
  web_search_enabled = body.get("web_search_enabled", False)
 
 
929
 
930
- print(f"Custom offer - webrtc_id: {webrtc_id}, web_search_enabled: {web_search_enabled}")
 
931
 
932
  # Store settings with timestamp
933
  if webrtc_id:
934
- web_search_settings[webrtc_id] = {
935
- 'enabled': web_search_enabled,
 
 
936
  'timestamp': asyncio.get_event_loop().time()
937
  }
938
 
@@ -953,6 +1231,24 @@ async def custom_offer(request: Request):
953
  return response
954
 
955
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
956
  @app.get("/outputs")
957
  async def outputs(webrtc_id: str):
958
  """Stream outputs including search events"""
@@ -962,10 +1258,16 @@ async def outputs(webrtc_id: str):
962
  # Check if it's a search event
963
  if isinstance(output.args[0], dict) and output.args[0].get('type') == 'search':
964
  yield f"event: search\ndata: {json.dumps(output.args[0])}\n\n"
965
- # Regular transcript event
966
- elif hasattr(output.args[0], 'transcript'):
967
- s = json.dumps({"role": "assistant", "content": output.args[0].transcript})
968
- yield f"event: output\ndata: {s}\n\n"
 
 
 
 
 
 
969
 
970
  return StreamingResponse(output_stream(), media_type="text/event-stream")
971
 
 
25
 
26
  SAMPLE_RATE = 24000
27
 
28
+ # Supported languages for OpenAI Realtime API
29
+ SUPPORTED_LANGUAGES = {
30
+ "ko": "한국어 (Korean)",
31
+ "en": "English",
32
+ "es": "Español (Spanish)",
33
+ "fr": "Français (French)",
34
+ "de": "Deutsch (German)",
35
+ "it": "Italiano (Italian)",
36
+ "pt": "Português (Portuguese)",
37
+ "ru": "Русский (Russian)",
38
+ "ja": "日本語 (Japanese)",
39
+ "zh": "中文 (Chinese)",
40
+ "ar": "العربية (Arabic)",
41
+ "hi": "हिन्दी (Hindi)",
42
+ "nl": "Nederlands (Dutch)",
43
+ "pl": "Polski (Polish)",
44
+ "tr": "Türkçe (Turkish)",
45
+ "vi": "Tiếng Việt (Vietnamese)",
46
+ "th": "ไทย (Thai)",
47
+ "id": "Bahasa Indonesia",
48
+ "sv": "Svenska (Swedish)",
49
+ "da": "Dansk (Danish)",
50
+ "no": "Norsk (Norwegian)",
51
+ "fi": "Suomi (Finnish)",
52
+ "he": "עברית (Hebrew)",
53
+ "uk": "Українська (Ukrainian)",
54
+ "cs": "Čeština (Czech)",
55
+ "el": "Ελληνικά (Greek)",
56
+ "ro": "Română (Romanian)",
57
+ "hu": "Magyar (Hungarian)",
58
+ "ms": "Bahasa Melayu (Malay)"
59
+ }
60
+
61
  # HTML content embedded as a string
62
  HTML_CONTENT = """<!DOCTYPE html>
63
  <html lang="ko">
 
120
  font-size: 32px;
121
  letter-spacing: 1px;
122
  }
123
+ /* Settings section */
124
+ .settings-section {
125
+ background-color: var(--card-bg);
126
+ border-radius: 12px;
127
+ padding: 20px;
128
+ margin-bottom: 20px;
129
+ border: 1px solid var(--border-color);
130
+ }
131
+ .settings-grid {
132
+ display: grid;
133
+ grid-template-columns: 1fr 1fr;
134
+ gap: 15px;
135
+ margin-bottom: 15px;
136
+ }
137
+ .setting-item {
138
  display: flex;
139
  align-items: center;
 
140
  gap: 10px;
 
141
  }
142
+ .setting-label {
143
+ font-size: 14px;
144
+ color: #aaa;
145
+ min-width: 80px;
146
+ }
147
+ /* Toggle switch */
148
  .toggle-switch {
149
  position: relative;
150
  width: 50px;
 
170
  .toggle-switch.active .toggle-slider {
171
  transform: translateX(24px);
172
  }
173
+ /* Select dropdown */
174
+ select {
175
+ background-color: var(--card-bg);
176
+ color: var(--text-color);
177
+ border: 1px solid var(--border-color);
178
+ padding: 8px 12px;
179
+ border-radius: 6px;
180
  font-size: 14px;
181
+ cursor: pointer;
182
+ min-width: 150px;
183
+ }
184
+ select:focus {
185
+ outline: none;
186
+ border-color: var(--primary-color);
187
+ }
188
+ /* Text inputs */
189
+ .text-input-section {
190
+ margin-top: 15px;
191
+ }
192
+ input[type="text"], textarea {
193
+ width: 100%;
194
+ background-color: var(--dark-bg);
195
+ color: var(--text-color);
196
+ border: 1px solid var(--border-color);
197
+ padding: 10px;
198
+ border-radius: 6px;
199
+ font-size: 14px;
200
+ box-sizing: border-box;
201
+ margin-top: 5px;
202
+ }
203
+ input[type="text"]:focus, textarea:focus {
204
+ outline: none;
205
+ border-color: var(--primary-color);
206
+ }
207
+ textarea {
208
+ resize: vertical;
209
+ min-height: 80px;
210
  }
211
  .chat-container {
212
  border-radius: 12px;
 
276
  margin-top: 20px;
277
  display: flex;
278
  justify-content: center;
279
+ gap: 10px;
280
  flex-shrink: 0;
281
  }
282
  button {
 
305
  button:active {
306
  transform: translateY(1px);
307
  }
308
+ #send-button {
309
+ background: linear-gradient(135deg, #2ecc71, #27ae60);
310
+ padding: 10px 20px;
311
+ font-size: 14px;
312
+ }
313
+ #send-button:hover {
314
+ background: linear-gradient(135deg, #27ae60, #229954);
315
+ }
316
  #audio-output {
317
  display: none;
318
  }
 
437
  background-color: var(--secondary-color);
438
  border-radius: 50%;
439
  }
440
+ .language-info {
441
+ font-size: 12px;
442
+ color: #888;
443
+ margin-left: 5px;
444
+ }
445
  </style>
446
  </head>
447
 
 
457
  </div>
458
  <h1>MOUSE 음성 챗</h1>
459
  </div>
 
 
 
 
 
 
460
  <div class="status-indicator">
461
  <div id="status-dot" class="status-dot disconnected"></div>
462
  <span id="status-text">연결 대기 중</span>
463
  </div>
464
  </div>
465
+
466
+ <div class="settings-section">
467
+ <div class="settings-grid">
468
+ <div class="setting-item">
469
+ <span class="setting-label">웹 검색</span>
470
+ <div id="search-toggle" class="toggle-switch">
471
+ <div class="toggle-slider"></div>
472
+ </div>
473
+ </div>
474
+ <div class="setting-item">
475
+ <span class="setting-label">자동 번역</span>
476
+ <select id="language-select">
477
+ <option value="">비활성화</option>
478
+ <option value="ko">한국어 (Korean)</option>
479
+ <option value="en">English</option>
480
+ <option value="es">Español (Spanish)</option>
481
+ <option value="fr">Français (French)</option>
482
+ <option value="de">Deutsch (German)</option>
483
+ <option value="it">Italiano (Italian)</option>
484
+ <option value="pt">Português (Portuguese)</option>
485
+ <option value="ru">Русский (Russian)</option>
486
+ <option value="ja">日本語 (Japanese)</option>
487
+ <option value="zh">中文 (Chinese)</option>
488
+ <option value="ar">العربية (Arabic)</option>
489
+ <option value="hi">हिन्दी (Hindi)</option>
490
+ <option value="nl">Nederlands (Dutch)</option>
491
+ <option value="pl">Polski (Polish)</option>
492
+ <option value="tr">Türkçe (Turkish)</option>
493
+ <option value="vi">Tiếng Việt (Vietnamese)</option>
494
+ <option value="th">ไทย (Thai)</option>
495
+ <option value="id">Bahasa Indonesia</option>
496
+ <option value="sv">Svenska (Swedish)</option>
497
+ <option value="da">Dansk (Danish)</option>
498
+ <option value="no">Norsk (Norwegian)</option>
499
+ <option value="fi">Suomi (Finnish)</option>
500
+ <option value="he">עברית (Hebrew)</option>
501
+ <option value="uk">Українська (Ukrainian)</option>
502
+ <option value="cs">Čeština (Czech)</option>
503
+ <option value="el">Ελληνικά (Greek)</option>
504
+ <option value="ro">Română (Romanian)</option>
505
+ <option value="hu">Magyar (Hungarian)</option>
506
+ <option value="ms">Bahasa Melayu (Malay)</option>
507
+ </select>
508
+ </div>
509
+ </div>
510
+ <div class="text-input-section">
511
+ <label for="system-prompt" class="setting-label">시스템 프롬프트:</label>
512
+ <textarea id="system-prompt" placeholder="AI 어시스턴트의 성격, 역할, 행동 방식을 정의하세요...">You are a helpful assistant. Respond in a friendly and professional manner.</textarea>
513
+ </div>
514
+ </div>
515
+
516
  <div class="chat-container">
517
  <div class="chat-messages" id="chat-messages"></div>
518
+ <div class="text-input-section" style="margin-top: 10px;">
519
+ <input type="text" id="text-input" placeholder="텍스트 메시지를 입력하세요..." />
520
+ </div>
521
  </div>
522
  <div class="controls">
523
  <button id="start-button">대화 시작</button>
524
+ <button id="send-button" style="display: none;">전송</button>
525
  </div>
526
  </div>
527
  <audio id="audio-output"></audio>
 
530
  let peerConnection;
531
  let webrtc_id;
532
  let webSearchEnabled = false;
533
+ let selectedLanguage = "";
534
+ let systemPrompt = "You are a helpful assistant. Respond in a friendly and professional manner.";
535
  const audioOutput = document.getElementById('audio-output');
536
  const startButton = document.getElementById('start-button');
537
+ const sendButton = document.getElementById('send-button');
538
  const chatMessages = document.getElementById('chat-messages');
539
  const statusDot = document.getElementById('status-dot');
540
  const statusText = document.getElementById('status-text');
541
  const searchToggle = document.getElementById('search-toggle');
542
+ const languageSelect = document.getElementById('language-select');
543
+ const systemPromptInput = document.getElementById('system-prompt');
544
+ const textInput = document.getElementById('text-input');
545
  let audioLevel = 0;
546
  let animationFrame;
547
  let audioContext, analyser, audioSource;
548
+ let dataChannel = null;
549
 
550
  // Web search toggle functionality
551
  searchToggle.addEventListener('click', () => {
 
554
  console.log('Web search enabled:', webSearchEnabled);
555
  });
556
 
557
+ // Language selection
558
+ languageSelect.addEventListener('change', () => {
559
+ selectedLanguage = languageSelect.value;
560
+ console.log('Selected language:', selectedLanguage);
561
+ });
562
+
563
+ // System prompt update
564
+ systemPromptInput.addEventListener('input', () => {
565
+ systemPrompt = systemPromptInput.value || "You are a helpful assistant. Respond in a friendly and professional manner.";
566
+ });
567
+
568
+ // Text input handling
569
+ textInput.addEventListener('keypress', (e) => {
570
+ if (e.key === 'Enter' && !e.shiftKey) {
571
+ e.preventDefault();
572
+ sendTextMessage();
573
+ }
574
+ });
575
+
576
+ sendButton.addEventListener('click', sendTextMessage);
577
+
578
function sendTextMessage() {
    // Sends the trimmed contents of the text box over the data channel as a
    // 'text_message' JSON payload, echoes it into the chat as a user message,
    // and then clears the box. Blank input, or a data channel that is missing
    // or not yet open, makes this a no-op.
    // NOTE(review): how the server consumes this data-channel payload is not
    // visible here - confirm it is actually handled on the other side.
    const channelOpen = Boolean(dataChannel) && dataChannel.readyState === 'open';
    const outgoing = textInput.value.trim();
    if (outgoing === '' || !channelOpen) {
        return;
    }

    addMessage('user', outgoing);
    const payload = { type: 'text_message', content: outgoing };
    dataChannel.send(JSON.stringify(payload));
    textInput.value = '';
}
589
+
590
  function updateStatus(state) {
591
  statusDot.className = 'status-dot ' + state;
592
  if (state === 'connected') {
593
  statusText.textContent = '연결됨';
594
+ sendButton.style.display = 'block';
595
  } else if (state === 'connecting') {
596
  statusText.textContent = '연결 중...';
597
+ sendButton.style.display = 'none';
598
  } else {
599
  statusText.textContent = '연결 대기 중';
600
+ sendButton.style.display = 'none';
601
  }
602
  }
603
  function updateButtonState() {
 
698
  audioOutput.play();
699
  }
700
  });
701
+
702
+ // Create data channel for text messages
703
+ dataChannel = peerConnection.createDataChannel('text');
704
+ dataChannel.onopen = () => {
705
+ console.log('Data channel opened');
706
+ };
707
  dataChannel.onmessage = (event) => {
708
  const eventJson = JSON.parse(event.data);
709
  if (eventJson.type === "error") {
710
  showError(eventJson.message);
711
  }
712
  };
713
+
714
  const offer = await peerConnection.createOffer();
715
  await peerConnection.setLocalDescription(offer);
716
  await new Promise((resolve) => {
 
743
  sdp: peerConnection.localDescription.sdp,
744
  type: peerConnection.localDescription.type,
745
  webrtc_id: webrtc_id,
746
+ web_search_enabled: webSearchEnabled,
747
+ target_language: selectedLanguage,
748
+ system_prompt: systemPrompt
749
  })
750
  });
751
  const serverResponse = await response.json();
 
760
  const eventSource = new EventSource('/outputs?webrtc_id=' + webrtc_id);
761
  eventSource.addEventListener("output", (event) => {
762
  const eventJson = JSON.parse(event.data);
763
+ let content = eventJson.content;
764
+ if (selectedLanguage && eventJson.language) {
765
+ content += ` <span class="language-info">[${eventJson.language}]</span>`;
766
+ }
767
+ addMessage("assistant", content);
768
  });
769
  eventSource.addEventListener("search", (event) => {
770
  const eventJson = JSON.parse(event.data);
 
782
  function addMessage(role, content) {
783
  const messageDiv = document.createElement('div');
784
  messageDiv.classList.add('message', role);
785
+ if (content.includes('<span')) {
786
+ messageDiv.innerHTML = content;
787
+ } else {
788
+ messageDiv.textContent = content;
789
+ }
790
  chatMessages.appendChild(messageDiv);
791
  chatMessages.scrollTop = chatMessages.scrollHeight;
792
  }
 
816
  console.log('closing');
817
  peerConnection.close();
818
  }
819
+ dataChannel = null;
820
  updateButtonState();
821
  audioLevel = 0;
822
  }
 
882
  search_client = BraveSearchClient(brave_api_key) if brave_api_key else None
883
  print(f"Search client initialized: {search_client is not None}, API key present: {bool(brave_api_key)}")
884
 
885
+ # Store connection settings
886
+ connection_settings = {}
887
 
888
  def update_chatbot(chatbot: list[dict], response: ResponseAudioTranscriptDoneEvent):
889
  chatbot.append({"role": "assistant", "content": response.transcript})
 
891
 
892
 
893
  class OpenAIHandler(AsyncStreamHandler):
894
+ def __init__(self, web_search_enabled: bool = False, target_language: str = "",
895
+ system_prompt: str = "", webrtc_id: str = None) -> None:
896
  super().__init__(
897
  expected_layout="mono",
898
  output_sample_rate=SAMPLE_RATE,
 
907
  self.current_call_id = None
908
  self.webrtc_id = webrtc_id
909
  self.web_search_enabled = web_search_enabled
910
+ self.target_language = target_language
911
+ self.system_prompt = system_prompt
912
+ print(f"Handler created with web_search_enabled={web_search_enabled}, "
913
+ f"target_language={target_language}, webrtc_id={webrtc_id}")
914
 
915
  def copy(self):
916
  # Get the most recent settings
917
+ if connection_settings:
918
  # Get the most recent webrtc_id
919
+ recent_ids = sorted(connection_settings.keys(),
920
+ key=lambda k: connection_settings[k].get('timestamp', 0),
921
  reverse=True)
922
  if recent_ids:
923
  recent_id = recent_ids[0]
924
+ settings = connection_settings[recent_id]
925
+ return OpenAIHandler(
926
+ web_search_enabled=settings.get('web_search_enabled', False),
927
+ target_language=settings.get('target_language', ''),
928
+ system_prompt=settings.get('system_prompt', ''),
929
+ webrtc_id=recent_id
930
+ )
931
 
932
  print(f"Handler.copy() called - creating new handler with default settings")
933
  return OpenAIHandler(web_search_enabled=False)
 
953
 
954
  return f"웹 검색 결과 '{query}':\n\n" + "\n".join(formatted_results)
955
 
956
async def process_text_message(self, message: str):
    """Add a typed user message to the realtime conversation and request a reply.

    Args:
        message: Text entered by the user.

    Does nothing when the realtime connection has not been established, or
    when the message is empty or contains only whitespace.
    """
    if not self.connection:
        return
    # A blank message adds nothing to the conversation; skipping it avoids
    # requesting a model response for no input.
    if not message or not message.strip():
        return

    await self.connection.conversation.item.create(
        item={
            "type": "message",
            "role": "user",
            "content": [{"type": "input_text", "text": message}],
        }
    )
    # Request a response now that the user item has been added.
    await self.connection.response.create()
967
+
968
def get_translation_instructions(self):
    """Build the prompt suffix that asks the model to reply in the target language.

    Returns:
        str: an empty string when no target language is set; otherwise an
        instruction block naming the language by its ``SUPPORTED_LANGUAGES``
        label, or by the raw code when the code is not in that table.
    """
    code = self.target_language
    if code:
        label = SUPPORTED_LANGUAGES.get(code, code)
        pieces = [
            f"\n\nIMPORTANT: You must respond in {label} ({code}). ",
            f"Translate all your responses to {label}. ",
            "This includes both spoken and written responses.",
        ]
        return "".join(pieces)
    return ""
979
+
980
  async def start_up(self):
981
  """Connect to realtime API with function calling enabled"""
982
  # First check if we have the most recent settings
983
+ if connection_settings:
984
+ recent_ids = sorted(connection_settings.keys(),
985
+ key=lambda k: connection_settings[k].get('timestamp', 0),
986
  reverse=True)
987
  if recent_ids:
988
  recent_id = recent_ids[0]
989
+ settings = connection_settings[recent_id]
990
+ self.web_search_enabled = settings.get('web_search_enabled', False)
991
+ self.target_language = settings.get('target_language', '')
992
+ self.system_prompt = settings.get('system_prompt', '')
993
  self.webrtc_id = recent_id
994
+ print(f"start_up: Updated settings from storage - webrtc_id={self.webrtc_id}, "
995
+ f"web_search_enabled={self.web_search_enabled}, target_language={self.target_language}")
996
 
997
+ print(f"Starting up handler with web_search_enabled={self.web_search_enabled}, "
998
+ f"target_language={self.target_language}")
999
  self.client = openai.AsyncOpenAI()
1000
 
1001
  # Define the web search function
1002
  tools = []
1003
+ base_instructions = self.system_prompt or "You are a helpful assistant."
1004
+
1005
+ # Add translation instructions if language is selected
1006
+ translation_instructions = self.get_translation_instructions()
1007
 
1008
  if self.web_search_enabled and self.search_client:
1009
  tools = [{
 
1016
  "properties": {
1017
  "query": {
1018
  "type": "string",
1019
+ "description": "The search query"
1020
  }
1021
  },
1022
  "required": ["query"]
 
1025
  }]
1026
  print("Web search function added to tools")
1027
 
1028
+ search_instructions = (
1029
+ "\n\nYou have web search capabilities. "
1030
  "IMPORTANT: You MUST use the web_search function for ANY of these topics:\n"
1031
  "- Weather (날씨, 기온, 비, 눈)\n"
1032
  "- News (뉴스, 소식)\n"
 
1036
  "- Any question about 2024 or 2025\n"
1037
  "- Any time-sensitive information\n\n"
1038
  "When in doubt, USE web_search. It's better to search and provide accurate information "
1039
+ "than to guess or use outdated information."
1040
  )
1041
+ instructions = base_instructions + search_instructions + translation_instructions
1042
+ else:
1043
+ instructions = base_instructions + translation_instructions
1044
 
1045
  async with self.client.beta.realtime.connect(
1046
  model="gpt-4o-mini-realtime-preview-2024-12-17"
 
1053
  "tool_choice": "auto" if tools else "none"
1054
  }
1055
 
1056
+ # Add voice setting if target language is selected
1057
+ if self.target_language:
1058
+ # Map languages to appropriate voices
1059
+ voice_map = {
1060
+ "en": "alloy",
1061
+ "es": "nova",
1062
+ "fr": "nova",
1063
+ "de": "nova",
1064
+ "ja": "nova",
1065
+ "zh": "nova",
1066
+ # Default to alloy for other languages
1067
+ }
1068
+ session_update["voice"] = voice_map.get(self.target_language, "alloy")
1069
+
1070
  await conn.session.update(session=session_update)
1071
  self.connection = conn
1072
+ print(f"Connected with tools: {len(tools)} functions, voice: {session_update.get('voice', 'default')}")
1073
 
1074
  async for event in self.connection:
1075
  # Debug logging for function calls
 
1077
  print(f"Function event: {event.type}")
1078
 
1079
  if event.type == "response.audio_transcript.done":
1080
+ output_data = {
1081
+ "event": event,
1082
+ "language": SUPPORTED_LANGUAGES.get(self.target_language, "") if self.target_language else ""
1083
+ }
1084
+ await self.output_queue.put(AdditionalOutputs(output_data))
1085
 
1086
  elif event.type == "response.audio.delta":
1087
  await self.output_queue.put(
 
1152
  # Connection might be closed, ignore the error
1153
 
1154
  async def emit(self) -> tuple[int, np.ndarray] | AdditionalOutputs | None:
1155
+ item = await wait_for_item(self.output_queue)
1156
+
1157
+ # Check if it's a dict with text message
1158
+ if isinstance(item, dict) and item.get('type') == 'text_message':
1159
+ await self.process_text_message(item['content'])
1160
+ return None
1161
+
1162
+ return item
1163
 
1164
  async def shutdown(self) -> None:
1165
  if self.connection:
 
1194
  # Intercept offer to capture settings
1195
  @app.post("/webrtc/offer", include_in_schema=False)
1196
  async def custom_offer(request: Request):
1197
+ """Intercept offer to capture settings"""
1198
  body = await request.json()
1199
 
1200
  webrtc_id = body.get("webrtc_id")
1201
  web_search_enabled = body.get("web_search_enabled", False)
1202
+ target_language = body.get("target_language", "")
1203
+ system_prompt = body.get("system_prompt", "")
1204
 
1205
+ print(f"Custom offer - webrtc_id: {webrtc_id}, web_search_enabled: {web_search_enabled}, "
1206
+ f"target_language: {target_language}")
1207
 
1208
  # Store settings with timestamp
1209
  if webrtc_id:
1210
+ connection_settings[webrtc_id] = {
1211
+ 'web_search_enabled': web_search_enabled,
1212
+ 'target_language': target_language,
1213
+ 'system_prompt': system_prompt,
1214
  'timestamp': asyncio.get_event_loop().time()
1215
  }
1216
 
 
1231
  return response
1232
 
1233
 
1234
@app.post("/text_message/{webrtc_id}")
async def receive_text_message(webrtc_id: str, request: Request):
    """Receive a text message from the client and queue it for its handler.

    Args:
        webrtc_id: Identifier of the connection the message belongs to.
        request: Incoming request whose JSON body carries the text under the
            ``content`` key.

    Returns:
        dict: ``{"status": "ok"}`` once the message has been queued, otherwise
        ``{"status": "error", "message": ...}`` explaining why it was dropped.
    """
    body = await request.json()
    message = body.get("content", "")

    # Report a dropped message instead of answering "ok" for it.
    if not isinstance(message, str) or not message.strip():
        return {"status": "error", "message": "empty message"}
    if webrtc_id not in stream.handlers:
        return {"status": "error", "message": "unknown webrtc_id"}

    handler = stream.handlers[webrtc_id]
    # The message is placed on the handler's output queue as a tagged dict.
    await handler.output_queue.put({
        'type': 'text_message',
        'content': message
    })

    return {"status": "ok"}
1250
+
1251
+
1252
  @app.get("/outputs")
1253
  async def outputs(webrtc_id: str):
1254
  """Stream outputs including search events"""
 
1258
  # Check if it's a search event
1259
  if isinstance(output.args[0], dict) and output.args[0].get('type') == 'search':
1260
  yield f"event: search\ndata: {json.dumps(output.args[0])}\n\n"
1261
+ # Regular transcript event with language info
1262
+ elif isinstance(output.args[0], dict) and 'event' in output.args[0]:
1263
+ event = output.args[0]['event']
1264
+ if hasattr(event, 'transcript'):
1265
+ data = {
1266
+ "role": "assistant",
1267
+ "content": event.transcript,
1268
+ "language": output.args[0].get('language', '')
1269
+ }
1270
+ yield f"event: output\ndata: {json.dumps(data)}\n\n"
1271
 
1272
  return StreamingResponse(output_stream(), media_type="text/event-stream")
1273