seawolf2357 committed on
Commit 21a8848 · verified · 1 Parent(s): 78579ce

Update app-backup3.py

Files changed (1):
  app-backup3.py  +125 -553
app-backup3.py CHANGED
@@ -162,20 +162,6 @@ HTML_CONTENT = """<!DOCTYPE html>
162
  gap: 15px;
163
  margin-bottom: 15px;
164
  }
165
- .interpretation-section {
166
- display: flex;
167
- flex-direction: column;
168
- gap: 15px;
169
- padding: 15px;
170
- background-color: var(--dark-bg);
171
- border-radius: 8px;
172
- margin-top: 15px;
173
- }
174
- .interpretation-info {
175
- font-size: 13px;
176
- color: #999;
177
- margin-top: 5px;
178
- }
179
  .setting-item {
180
  display: flex;
181
  align-items: center;
@@ -318,14 +304,10 @@ HTML_CONTENT = """<!DOCTYPE html>
318
  padding: 10px;
319
  margin-bottom: 10px;
320
  }
321
- .message.assistant.interpretation {
322
- background: linear-gradient(135deg, #1a5a3e, #2e7d32);
323
- font-style: italic;
324
- }
325
- .interpretation-arrow {
326
- color: #4caf50;
327
- font-weight: bold;
328
- margin: 0 10px;
329
  }
330
  .controls {
331
  text-align: center;
@@ -513,11 +495,6 @@ HTML_CONTENT = """<!DOCTYPE html>
513
  background-color: var(--secondary-color);
514
  border-radius: 50%;
515
  }
516
- .language-info {
517
- font-size: 12px;
518
- color: #888;
519
- margin-left: 5px;
520
- }
521
  </style>
522
  </head>
523
 
@@ -542,7 +519,7 @@ HTML_CONTENT = """<!DOCTYPE html>
542
  <div class="main-content">
543
  <div class="sidebar">
544
  <div class="settings-section">
545
- <h3 style="margin: 0 0 15px 0; color: var(--primary-color);">설정</h3>
546
  <div class="settings-grid">
547
  <div class="setting-item">
548
  <span class="setting-label">웹 검색</span>
@@ -551,7 +528,7 @@ HTML_CONTENT = """<!DOCTYPE html>
551
  </div>
552
  </div>
553
  <div class="setting-item">
554
- <span class="setting-label">자동 번역</span>
555
  <select id="language-select">
556
  <option value="">비활성화</option>
557
  <option value="ko">한국어 (Korean)</option>
@@ -586,55 +563,6 @@ HTML_CONTENT = """<!DOCTYPE html>
586
  </select>
587
  </div>
588
  </div>
589
- <div class="interpretation-section">
590
- <div class="setting-item">
591
- <span class="setting-label">자동 통역</span>
592
- <div id="interpretation-toggle" class="toggle-switch">
593
- <div class="toggle-slider"></div>
594
- </div>
595
- </div>
596
- <div class="setting-item" id="interpretation-language-container" style="display: none;">
597
- <span class="setting-label">통역 언어</span>
598
- <select id="interpretation-language-select">
599
- <option value="">언어 선택</option>
600
- <option value="ko">한국어 (Korean)</option>
601
- <option value="en">English</option>
602
- <option value="es">Español (Spanish)</option>
603
- <option value="fr">Français (French)</option>
604
- <option value="de">Deutsch (German)</option>
605
- <option value="it">Italiano (Italian)</option>
606
- <option value="pt">Português (Portuguese)</option>
607
- <option value="ru">Русский (Russian)</option>
608
- <option value="ja">日本語 (Japanese)</option>
609
- <option value="zh">中文 (Chinese)</option>
610
- <option value="ar">العربية (Arabic)</option>
611
- <option value="hi">हिन्दी (Hindi)</option>
612
- <option value="nl">Nederlands (Dutch)</option>
613
- <option value="pl">Polski (Polish)</option>
614
- <option value="tr">Türkçe (Turkish)</option>
615
- <option value="vi">Tiếng Việt (Vietnamese)</option>
616
- <option value="th">ไทย (Thai)</option>
617
- <option value="id">Bahasa Indonesia</option>
618
- <option value="sv">Svenska (Swedish)</option>
619
- <option value="da">Dansk (Danish)</option>
620
- <option value="no">Norsk (Norwegian)</option>
621
- <option value="fi">Suomi (Finnish)</option>
622
- <option value="he">עברית (Hebrew)</option>
623
- <option value="uk">Українська (Ukrainian)</option>
624
- <option value="cs">Čeština (Czech)</option>
625
- <option value="el">Ελληνικά (Greek)</option>
626
- <option value="ro">Română (Romanian)</option>
627
- <option value="hu">Magyar (Hungarian)</option>
628
- <option value="ms">Bahasa Melayu (Malay)</option>
629
- </select>
630
- </div>
631
- </div>
632
- <div class="interpretation-info" id="interpretation-info" style="display: none;">
633
- <strong>통역 모드 안내:</strong><br>
634
- • 음성으로 말하면 선택한 언어로 자동 통역됩니다<br>
635
- • Whisper + GPT-4o-mini + TTS를 사용합니다<br>
636
- • 말을 마치고 잠시 기다리면 통역이 시작됩니다
637
- </div>
638
  <div class="text-input-section">
639
  <label for="system-prompt" class="setting-label">시스템 프롬프트:</label>
640
  <textarea id="system-prompt" placeholder="AI 어시스턴트의 성격, 역할, 행동 방식을 정의하세요...">You are a helpful assistant. Respond in a friendly and professional manner.</textarea>
@@ -667,8 +595,6 @@ HTML_CONTENT = """<!DOCTYPE html>
667
  let webrtc_id;
668
  let webSearchEnabled = false;
669
  let selectedLanguage = "";
670
- let interpretationMode = false;
671
- let interpretationLanguage = "";
672
  let systemPrompt = "You are a helpful assistant. Respond in a friendly and professional manner.";
673
  const audioOutput = document.getElementById('audio-output');
674
  const startButton = document.getElementById('start-button');
@@ -678,10 +604,6 @@ HTML_CONTENT = """<!DOCTYPE html>
678
  const statusText = document.getElementById('status-text');
679
  const searchToggle = document.getElementById('search-toggle');
680
  const languageSelect = document.getElementById('language-select');
681
- const interpretationToggle = document.getElementById('interpretation-toggle');
682
- const interpretationLanguageSelect = document.getElementById('interpretation-language-select');
683
- const interpretationLanguageContainer = document.getElementById('interpretation-language-container');
684
- const interpretationInfo = document.getElementById('interpretation-info');
685
  const systemPromptInput = document.getElementById('system-prompt');
686
  const textInput = document.getElementById('text-input');
687
  let audioLevel = 0;
@@ -703,86 +625,6 @@ HTML_CONTENT = """<!DOCTYPE html>
703
  console.log('Selected language:', selectedLanguage);
704
  });
705
 
706
- // Interpretation mode toggle
707
- interpretationToggle.addEventListener('click', () => {
708
- if (!interpretationMode) {
709
- // Turning ON interpretation mode
710
- interpretationLanguageContainer.style.display = 'flex';
711
- interpretationInfo.style.display = 'block';
712
-
713
- // Show language selector first
714
- showError('통역 언어를 선택해주세요.');
715
- interpretationToggle.classList.remove('active');
716
-
717
- // Don't actually enable interpretation mode until language is selected
718
- return;
719
- } else {
720
- // Turning OFF interpretation mode
721
- interpretationMode = false;
722
- interpretationToggle.classList.remove('active');
723
- interpretationLanguageContainer.style.display = 'none';
724
- interpretationInfo.style.display = 'none';
725
- interpretationLanguage = '';
726
- interpretationLanguageSelect.value = '';
727
-
728
- // Re-enable other features
729
- languageSelect.disabled = false;
730
- searchToggle.style.opacity = '1';
731
- searchToggle.style.pointerEvents = 'auto';
732
- textInput.disabled = false;
733
- textInput.placeholder = '텍스트 메시지를 입력하세요...';
734
- sendButton.style.display = 'block';
735
-
736
- console.log('Interpretation mode disabled');
737
-
738
- // If connected, restart to apply normal mode
739
- if (peerConnection && peerConnection.connectionState === 'connected') {
740
- showError('일반 모드로 전환하기 위해 연결을 다시 시작합니다.');
741
- stop();
742
- setTimeout(() => {
743
- setupWebRTC();
744
- }, 500);
745
- }
746
- }
747
-
748
- console.log('Interpretation mode:', interpretationMode);
749
- });
750
-
751
- // Interpretation language selection
752
- interpretationLanguageSelect.addEventListener('change', () => {
753
- interpretationLanguage = interpretationLanguageSelect.value;
754
- console.log('Interpretation language:', interpretationLanguage);
755
-
756
- if (interpretationLanguage && !interpretationMode) {
757
- // Now actually enable interpretation mode
758
- interpretationMode = true;
759
- interpretationToggle.classList.add('active');
760
-
761
- // Disable other features
762
- languageSelect.value = '';
763
- selectedLanguage = '';
764
- languageSelect.disabled = true;
765
- searchToggle.classList.remove('active');
766
- webSearchEnabled = false;
767
- searchToggle.style.opacity = '0.5';
768
- searchToggle.style.pointerEvents = 'none';
769
- textInput.disabled = true;
770
- textInput.placeholder = '통역 모드에서는 텍스트 입력이 지원되지 않습니다';
771
- sendButton.style.display = 'none';
772
-
773
- console.log('Interpretation mode enabled with language:', interpretationLanguage);
774
-
775
- // If already connected, restart the connection with new settings
776
- if (peerConnection && peerConnection.connectionState === 'connected') {
777
- showError('통역 모드 설정을 적용하기 위해 연결을 다시 시작합니다.');
778
- stop();
779
- setTimeout(() => {
780
- setupWebRTC();
781
- }, 500);
782
- }
783
- }
784
- });
785
-
786
  // System prompt update
787
  systemPromptInput.addEventListener('input', () => {
788
  systemPrompt = systemPromptInput.value || "You are a helpful assistant. Respond in a friendly and professional manner.";
@@ -802,12 +644,6 @@ HTML_CONTENT = """<!DOCTYPE html>
802
  const message = textInput.value.trim();
803
  if (!message) return;
804
 
805
- // Don't allow text messages in interpretation mode
806
- if (interpretationMode) {
807
- showError('통역 모드에서는 텍스트 입력이 지원되지 않습니다.');
808
- return;
809
- }
810
-
811
  // Add user message to chat
812
  addMessage('user', message);
813
  textInput.value = '';
@@ -861,18 +697,14 @@ HTML_CONTENT = """<!DOCTYPE html>
861
  statusDot.className = 'status-dot ' + state;
862
  if (state === 'connected') {
863
  statusText.textContent = '연결됨';
864
- if (!interpretationMode) {
865
- sendButton.style.display = 'block';
866
- }
867
  isVoiceActive = true;
868
  } else if (state === 'connecting') {
869
  statusText.textContent = '연결 중...';
870
  sendButton.style.display = 'none';
871
  } else {
872
  statusText.textContent = '연결 대기 중';
873
- if (!interpretationMode) {
874
- sendButton.style.display = 'block'; // Show send button even when disconnected for text chat
875
- }
876
  isVoiceActive = false;
877
  }
878
  }
@@ -1018,9 +850,7 @@ HTML_CONTENT = """<!DOCTYPE html>
1018
  webrtc_id: webrtc_id,
1019
  web_search_enabled: webSearchEnabled,
1020
  target_language: selectedLanguage,
1021
- system_prompt: systemPrompt,
1022
- interpretation_mode: interpretationMode,
1023
- interpretation_language: interpretationLanguage
1024
  });
1025
 
1026
  const response = await fetch('/webrtc/offer', {
@@ -1032,9 +862,7 @@ HTML_CONTENT = """<!DOCTYPE html>
1032
  webrtc_id: webrtc_id,
1033
  web_search_enabled: webSearchEnabled,
1034
  target_language: selectedLanguage,
1035
- system_prompt: systemPrompt,
1036
- interpretation_mode: interpretationMode,
1037
- interpretation_language: interpretationLanguage
1038
  })
1039
  });
1040
  const serverResponse = await response.json();
@@ -1051,30 +879,8 @@ HTML_CONTENT = """<!DOCTYPE html>
1051
  const eventJson = JSON.parse(event.data);
1052
  let content = eventJson.content;
1053
 
1054
- // Debug logging for interpretation mode
1055
- if (interpretationMode) {
1056
- console.log('[INTERPRETATION OUTPUT]', {
1057
- content: content,
1058
- language: eventJson.language,
1059
- mode: eventJson.mode,
1060
- expectedLanguage: interpretationLanguage
1061
- });
1062
- }
1063
-
1064
  if (selectedLanguage && eventJson.language) {
1065
  content += ` <span class="language-info">[${eventJson.language}]</span>`;
1066
- } else if (interpretationMode && eventJson.language) {
1067
- // In interpretation mode, show the translation process
1068
- if (content.includes('→')) {
1069
- // Format: "Korean text → English text"
1070
- const parts = content.split('→');
1071
- if (parts.length === 2) {
1072
- content = `<span style="color: #999;">${parts[0].trim()}</span>` +
1073
- `<span class="interpretation-arrow">→</span>` +
1074
- `<strong>${parts[1].trim()}</strong>`;
1075
- }
1076
- }
1077
- content += ` <span class="language-info">[통역: ${eventJson.language}]</span>`;
1078
  }
1079
  addMessage("assistant", content);
1080
  });
@@ -1095,11 +901,6 @@ HTML_CONTENT = """<!DOCTYPE html>
1095
  const messageDiv = document.createElement('div');
1096
  messageDiv.classList.add('message', role);
1097
 
1098
- // Check if it's an interpretation message
1099
- if (interpretationMode && role === 'assistant' && content.includes('→')) {
1100
- messageDiv.classList.add('interpretation');
1101
- }
1102
-
1103
  if (content.includes('<span')) {
1104
  messageDiv.innerHTML = content;
1105
  } else {
@@ -1108,17 +909,35 @@ HTML_CONTENT = """<!DOCTYPE html>
1108
  chatMessages.appendChild(messageDiv);
1109
  chatMessages.scrollTop = chatMessages.scrollHeight;
1110
  }
 
1111
  function stop() {
 
 
 
1112
  if (animationFrame) {
1113
  cancelAnimationFrame(animationFrame);
 
1114
  }
 
 
1115
  if (audioContext) {
1116
  audioContext.close();
1117
  audioContext = null;
1118
  analyser = null;
1119
  audioSource = null;
1120
  }
1121
  if (peerConnection) {
 
 
 
1122
  if (peerConnection.getTransceivers) {
1123
  peerConnection.getTransceivers().forEach(transceiver => {
1124
  if (transceiver.stop) {
@@ -1126,17 +945,46 @@ HTML_CONTENT = """<!DOCTYPE html>
1126
  }
1127
  });
1128
  }
 
 
1129
  if (peerConnection.getSenders) {
1130
  peerConnection.getSenders().forEach(sender => {
1131
- if (sender.track && sender.track.stop) sender.track.stop();
1132
  });
1133
  }
1134
- console.log('closing');
 
1135
  peerConnection.close();
1136
  }
1137
- dataChannel = null;
1138
- updateButtonState();
1139
  audioLevel = 0;
1140
  }
1141
  startButton.addEventListener('click', () => {
1142
  console.log('clicked');
@@ -1149,7 +997,7 @@ HTML_CONTENT = """<!DOCTYPE html>
1149
  }
1150
  });
1151
 
1152
- // Initialize send button visibility on page load
1153
  window.addEventListener('DOMContentLoaded', () => {
1154
  sendButton.style.display = 'block';
1155
  });
@@ -1227,18 +1075,6 @@ def update_chatbot(chatbot: list[dict], response: ResponseAudioTranscriptDoneEve
1227
  return chatbot
1228
 
1229
 
1230
- def get_translation_instructions(target_language: str) -> str:
1231
- """Get instructions for translation based on target language"""
1232
- if not target_language:
1233
- return ""
1234
-
1235
- language_name = SUPPORTED_LANGUAGES.get(target_language, target_language)
1236
- return (
1237
- f"\n\nIMPORTANT: You must respond in {language_name} ({target_language}). "
1238
- f"Translate all your responses to {language_name}."
1239
- )
1240
-
1241
-
1242
  async def process_text_chat(message: str, web_search_enabled: bool, target_language: str,
1243
  system_prompt: str) -> Dict[str, str]:
1244
  """Process text chat using GPT-4o-mini model"""
@@ -1342,8 +1178,7 @@ async def process_text_chat(message: str, web_search_enabled: bool, target_langu
1342
 
1343
  class OpenAIHandler(AsyncStreamHandler):
1344
  def __init__(self, web_search_enabled: bool = False, target_language: str = "",
1345
- system_prompt: str = "", webrtc_id: str = None,
1346
- interpretation_mode: bool = False, interpretation_language: str = "") -> None:
1347
  super().__init__(
1348
  expected_layout="mono",
1349
  output_sample_rate=SAMPLE_RATE,
@@ -1360,19 +1195,9 @@ class OpenAIHandler(AsyncStreamHandler):
1360
  self.web_search_enabled = web_search_enabled
1361
  self.target_language = target_language
1362
  self.system_prompt = system_prompt
1363
- self.interpretation_mode = interpretation_mode
1364
- self.interpretation_language = interpretation_language
1365
 
1366
- # For interpretation mode
1367
- self.audio_buffer = []
1368
- self.is_recording = False
1369
- self.silence_frames = 0
1370
- self.silence_threshold = 20 # Reduced for faster response (20 frames = ~0.4 seconds)
1371
- self.min_audio_length = 10 # Minimum frames to consider as speech
1372
-
1373
- print(f"Handler created with web_search_enabled={web_search_enabled}, "
1374
- f"target_language={target_language}, webrtc_id={webrtc_id}, "
1375
- f"interpretation_mode={interpretation_mode}, interpretation_language={interpretation_language}")
1376
 
1377
  def copy(self):
1378
  # Get the most recent settings
@@ -1384,17 +1209,19 @@ class OpenAIHandler(AsyncStreamHandler):
1384
  if recent_ids:
1385
  recent_id = recent_ids[0]
1386
  settings = connection_settings[recent_id]
1387
  return OpenAIHandler(
1388
  web_search_enabled=settings.get('web_search_enabled', False),
1389
  target_language=settings.get('target_language', ''),
1390
  system_prompt=settings.get('system_prompt', ''),
1391
- webrtc_id=recent_id,
1392
- interpretation_mode=settings.get('interpretation_mode', False),
1393
- interpretation_language=settings.get('interpretation_language', '')
1394
  )
1395
 
1396
- print(f"Handler.copy() called - creating new handler with default settings")
1397
- return OpenAIHandler(web_search_enabled=False, interpretation_mode=False)
1398
 
1399
  async def search_web(self, query: str) -> str:
1400
  """Perform web search and return formatted results"""
@@ -1429,211 +1256,9 @@ class OpenAIHandler(AsyncStreamHandler):
1429
  )
1430
  await self.connection.response.create()
1431
 
1432
- async def process_interpretation(self):
1433
- """Process audio buffer for interpretation"""
1434
- if not self.audio_buffer or not self.interpretation_language:
1435
- return
1436
-
1437
- try:
1438
- print(f"[INTERPRETATION] Processing audio buffer with {len(self.audio_buffer)} frames")
1439
-
1440
- # Convert audio buffer to WAV format
1441
- audio_data = np.concatenate(self.audio_buffer)
1442
-
1443
- # Create WAV file in memory
1444
- wav_buffer = io.BytesIO()
1445
- with wave.open(wav_buffer, 'wb') as wav_file:
1446
- wav_file.setnchannels(1) # Mono
1447
- wav_file.setsampwidth(2) # 16-bit
1448
- wav_file.setframerate(SAMPLE_RATE)
1449
- wav_file.writeframes(audio_data.tobytes())
1450
-
1451
- wav_buffer.seek(0)
1452
- wav_buffer.name = "audio.wav"
1453
-
1454
- # 1. Transcribe with Whisper
1455
- print("[INTERPRETATION] Transcribing with Whisper...")
1456
- transcript = await self.client.audio.transcriptions.create(
1457
- model="whisper-1",
1458
- file=wav_buffer,
1459
- language="ko" # Assuming Korean input
1460
- )
1461
-
1462
- user_text = transcript.text.strip()
1463
- print(f"[INTERPRETATION] Transcribed: {user_text}")
1464
-
1465
- if not user_text:
1466
- return
1467
-
1468
- # 2. Translate with GPT-4o-mini
1469
- target_lang_name = SUPPORTED_LANGUAGES.get(self.interpretation_language, self.interpretation_language)
1470
-
1471
- # Create very explicit translation examples
1472
- translation_examples = {
1473
- "en": {
1474
- "안녕하세요": "Hello",
1475
- "감사합니다": "Thank you",
1476
- "오늘 날씨가 좋네요": "The weather is nice today"
1477
- },
1478
- "ja": {
1479
- "안녕하세요": "こんにちは",
1480
- "감사합니다": "ありがとうございます",
1481
- "오늘 날씨가 좋네요": "今日はいい天気ですね"
1482
- },
1483
- "zh": {
1484
- "안녕하세요": "你好",
1485
- "감사합니다": "谢谢",
1486
- "오늘 날씨가 좋네요": "今天天气很好"
1487
- },
1488
- "es": {
1489
- "안녕하세요": "Hola",
1490
- "감사합니다": "Gracias",
1491
- "오늘 날씨가 좋네요": "El clima está agradable hoy"
1492
- }
1493
- }
1494
-
1495
- examples = translation_examples.get(self.interpretation_language, translation_examples["en"])
1496
- examples_text = "\n".join([f'"{k}" → "{v}"' for k, v in examples.items()])
1497
-
1498
- # Ultra-specific prompt
1499
- system_prompt = f"""You are a Korean to {target_lang_name} translator.
1500
-
1501
- STRICT RULES:
1502
- 1. Output ONLY the {target_lang_name} translation
1503
- 2. Do NOT output Korean
1504
- 3. Do NOT add explanations
1505
- 4. Do NOT answer questions
1506
- 5. Just translate
1507
-
1508
- Examples:
1509
- {examples_text}
1510
-
1511
- Now translate the Korean text to {target_lang_name}. Output ONLY the translation in {target_lang_name}:"""
1512
-
1513
- print(f"[INTERPRETATION] Translating to {target_lang_name}...")
1514
- print(f"[INTERPRETATION] System prompt: {system_prompt}")
1515
-
1516
- translation_response = await self.client.chat.completions.create(
1517
- model="gpt-4o-mini",
1518
- messages=[
1519
- {
1520
- "role": "system",
1521
- "content": system_prompt
1522
- },
1523
- {
1524
- "role": "user",
1525
- "content": f"Translate this Korean to {target_lang_name}: {user_text}"
1526
- }
1527
- ],
1528
- temperature=0.1, # Very low temperature
1529
- max_tokens=200
1530
- )
1531
-
1532
- translated_text = translation_response.choices[0].message.content.strip()
1533
-
1534
- # Remove any Korean characters if they accidentally appear
1535
- import re
1536
- if re.search(r'[가-힣]', translated_text):
1537
- print(f"[INTERPRETATION] WARNING: Korean characters detected in translation: {translated_text}")
1538
- # Try to extract only non-Korean parts
1539
- translated_text = re.sub(r'[가-힣\s]+', ' ', translated_text).strip()
1540
-
1541
- print(f"[INTERPRETATION] Translated: {translated_text}")
1542
-
1543
- # 3. Generate speech with TTS
1544
- print(f"[INTERPRETATION] Generating speech for text: {translated_text}")
1545
-
1546
- # Select appropriate voice and ensure it speaks the target language
1547
- voice_map = {
1548
- "en": "alloy", # Alloy is native English speaker
1549
- "es": "nova", # Nova handles Spanish well
1550
- "fr": "shimmer", # Shimmer handles French well
1551
- "de": "echo", # Echo handles German well
1552
- "ja": "nova", # Nova can handle Japanese
1553
- "zh": "nova", # Nova can handle Chinese
1554
- "ko": "nova", # Nova can handle Korean
1555
- }
1556
- selected_voice = voice_map.get(self.interpretation_language, "nova")
1557
-
1558
- print(f"[INTERPRETATION] Using voice: {selected_voice} for language: {self.interpretation_language}")
1559
-
1560
- # For some languages, we might need to add pronunciation hints
1561
- if self.interpretation_language == "en" and re.search(r'[가-힣]', translated_text):
1562
- print("[INTERPRETATION] ERROR: Korean characters in English translation!")
1563
- translated_text = "Translation error occurred"
1564
-
1565
- try:
1566
- tts_response = await self.client.audio.speech.create(
1567
- model="tts-1",
1568
- voice=selected_voice,
1569
- input=translated_text,
1570
- response_format="pcm", # PCM format for direct playback
1571
- speed=1.0
1572
- )
1573
- except Exception as tts_error:
1574
- print(f"[INTERPRETATION] TTS Error: {tts_error}")
1575
- # If TTS fails, try with a different voice
1576
- tts_response = await self.client.audio.speech.create(
1577
- model="tts-1",
1578
- voice="alloy", # Fallback to alloy
1579
- input=translated_text,
1580
- response_format="pcm",
1581
- speed=1.0
1582
- )
1583
-
1584
- # Convert response to bytes
1585
- audio_bytes = b""
1586
- async for chunk in tts_response.iter_bytes(1024):
1587
- audio_bytes += chunk
1588
-
1589
- # Convert PCM to numpy array (TTS outputs at 24kHz)
1590
- audio_array = np.frombuffer(audio_bytes, dtype=np.int16)
1591
-
1592
- # Send audio in chunks
1593
- if len(audio_array) > 0:
1594
- # Split audio into chunks and send
1595
- chunk_size = 480 # Match our frame size
1596
- for i in range(0, len(audio_array), chunk_size):
1597
- chunk = audio_array[i:i + chunk_size]
1598
- if len(chunk) < chunk_size:
1599
- # Pad the last chunk if necessary
1600
- chunk = np.pad(chunk, (0, chunk_size - len(chunk)), 'constant')
1601
-
1602
- await self.output_queue.put((SAMPLE_RATE, chunk.reshape(1, -1)))
1603
-
1604
- # Send transcript event
1605
- output_data = {
1606
- "event": type('Event', (), {
1607
- 'transcript': f"{user_text} → {translated_text}"
1608
- })(),
1609
- "language": target_lang_name,
1610
- "mode": "interpretation"
1611
- }
1612
- await self.output_queue.put(AdditionalOutputs(output_data))
1613
-
1614
- except Exception as e:
1615
- print(f"[INTERPRETATION] Error: {e}")
1616
- import traceback
1617
- traceback.print_exc()
1618
-
1619
- # Send error message to client
1620
- error_data = {
1621
- "event": type('Event', (), {
1622
- 'transcript': f"통역 오류: {str(e)}"
1623
- })(),
1624
- "language": "",
1625
- "mode": "error"
1626
- }
1627
- await self.output_queue.put(AdditionalOutputs(error_data))
1628
- finally:
1629
- # Clear the audio buffer
1630
- self.audio_buffer = []
1631
- self.is_recording = False
1632
- self.silence_frames = 0
1633
-
1634
  def get_translation_instructions(self):
1635
  """Get instructions for translation based on target language"""
1636
- if not self.target_language or self.interpretation_mode:
1637
  return ""
1638
 
1639
  language_name = SUPPORTED_LANGUAGES.get(self.target_language, self.target_language)
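For reference, the interpretation path removed above chained three OpenAI endpoints: Whisper for transcription, GPT-4o-mini for translation, and TTS for synthesis back to PCM. A minimal standalone sketch of that chain, assuming the official openai Python client; the function name interpret_once and the alloy voice are illustrative choices, not taken from the app:

import io
import wave

import numpy as np
import openai


async def interpret_once(pcm: np.ndarray, target_lang: str, sample_rate: int = 24000) -> bytes:
    """Whisper -> GPT-4o-mini -> TTS round trip (illustrative sketch, not the app's exact code)."""
    client = openai.AsyncOpenAI()

    # Pack the int16 mono buffer into an in-memory WAV so Whisper accepts it.
    wav_buffer = io.BytesIO()
    with wave.open(wav_buffer, "wb") as wav_file:
        wav_file.setnchannels(1)
        wav_file.setsampwidth(2)
        wav_file.setframerate(sample_rate)
        wav_file.writeframes(pcm.tobytes())
    wav_buffer.seek(0)
    wav_buffer.name = "audio.wav"

    # 1. Transcribe the Korean speech.
    transcript = await client.audio.transcriptions.create(
        model="whisper-1", file=wav_buffer, language="ko"
    )

    # 2. Translate with a translation-only system prompt.
    translation = await client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system",
             "content": f"You are a Korean to {target_lang} translator. Output ONLY the translation."},
            {"role": "user", "content": transcript.text},
        ],
        temperature=0.1,
        max_tokens=200,
    )
    translated_text = translation.choices[0].message.content.strip()

    # 3. Synthesize the translation as raw PCM audio (mirrors the deleted iter_bytes loop).
    tts_response = await client.audio.speech.create(
        model="tts-1", voice="alloy", input=translated_text, response_format="pcm"
    )
    audio_bytes = b""
    async for chunk in tts_response.iter_bytes(1024):
        audio_bytes += chunk
    return audio_bytes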
@@ -1644,42 +1269,24 @@ Now translate the Korean text to {target_lang_name}. Output ONLY the translation
1644
  )
1645
 
1646
  async def start_up(self):
1647
- """Connect to realtime API or setup interpretation mode"""
1648
  # First check if we have the most recent settings
1649
- if connection_settings:
1650
- recent_ids = sorted(connection_settings.keys(),
1651
- key=lambda k: connection_settings[k].get('timestamp', 0),
1652
- reverse=True)
1653
- if recent_ids:
1654
- recent_id = recent_ids[0]
1655
- settings = connection_settings[recent_id]
1656
  self.web_search_enabled = settings.get('web_search_enabled', False)
1657
  self.target_language = settings.get('target_language', '')
1658
  self.system_prompt = settings.get('system_prompt', '')
1659
- self.interpretation_mode = settings.get('interpretation_mode', False)
1660
- self.interpretation_language = settings.get('interpretation_language', '')
1661
- self.webrtc_id = recent_id
1662
- print(f"start_up: Updated settings from storage - webrtc_id={self.webrtc_id}, "
1663
- f"web_search_enabled={self.web_search_enabled}, target_language={self.target_language}, "
1664
- f"interpretation_mode={self.interpretation_mode}")
1665
- print(f"Handler interpretation settings: mode={self.interpretation_mode}, language={self.interpretation_language}")
1666
 
1667
- print(f"Starting up handler with web_search_enabled={self.web_search_enabled}, "
1668
- f"target_language={self.target_language}, interpretation_mode={self.interpretation_mode}, "
1669
- f"interpretation_language={self.interpretation_language}")
1670
 
1671
  self.client = openai.AsyncOpenAI()
1672
 
1673
- # If in interpretation mode, don't connect to Realtime API
1674
- if self.interpretation_mode:
1675
- print(f"[INTERPRETATION MODE] Active - using Whisper + GPT-4o-mini + TTS")
1676
- print(f"[INTERPRETATION MODE] Target language: {self.interpretation_language}")
1677
- # Just keep the handler ready to process audio
1678
- # Don't use infinite loop here - the handler will be called by the framework
1679
- self.client = openai.AsyncOpenAI()
1680
- return
1681
-
1682
  # Normal mode - connect to Realtime API
 
 
1683
  # Define the web search function
1684
  tools = []
1685
  base_instructions = self.system_prompt or "You are a helpful assistant."
@@ -1893,18 +1500,18 @@ RULES:
1893
  ),
1894
  )
1895
 
1896
- # Handle function calls (only in non-interpretation mode)
1897
- elif event.type == "response.function_call_arguments.start" and not self.interpretation_mode:
1898
  print(f"Function call started")
1899
  self.function_call_in_progress = True
1900
  self.current_function_args = ""
1901
  self.current_call_id = getattr(event, 'call_id', None)
1902
 
1903
- elif event.type == "response.function_call_arguments.delta" and not self.interpretation_mode:
1904
  if self.function_call_in_progress:
1905
  self.current_function_args += event.delta
1906
 
1907
- elif event.type == "response.function_call_arguments.done" and not self.interpretation_mode:
1908
  if self.function_call_in_progress:
1909
  print(f"Function call done, args: {self.current_function_args}")
1910
  try:
@@ -1940,77 +1547,41 @@ RULES:
1940
  self.current_call_id = None
1941
 
1942
  async def receive(self, frame: tuple[int, np.ndarray]) -> None:
1943
- if self.interpretation_mode:
1944
- # In interpretation mode, buffer audio and process with Whisper
 
 
 
1945
  _, array = frame
1946
  array = array.squeeze()
1947
-
1948
- # Simple voice activity detection
1949
- audio_level = np.abs(array).mean()
1950
-
1951
- if audio_level > 200: # Lower threshold for better detection
1952
- if not self.is_recording:
1953
- print(f"[INTERPRETATION] Started recording, level: {audio_level:.1f}")
1954
- self.is_recording = True
1955
- self.silence_frames = 0
1956
- self.audio_buffer.append(array)
1957
- elif self.is_recording:
1958
- self.silence_frames += 1
1959
- self.audio_buffer.append(array)
1960
-
1961
- # If we've had enough silence, process the audio
1962
- if self.silence_frames > self.silence_threshold and len(self.audio_buffer) > self.min_audio_length:
1963
- print(f"[INTERPRETATION] Silence detected after {len(self.audio_buffer)} frames")
1964
- # Process in the background to avoid blocking
1965
- asyncio.create_task(self.process_interpretation())
1966
- else:
1967
- # Normal mode - use Realtime API
1968
- if not self.connection:
1969
- return
1970
- try:
1971
- _, array = frame
1972
- array = array.squeeze()
1973
- audio_message = base64.b64encode(array.tobytes()).decode("utf-8")
1974
- await self.connection.input_audio_buffer.append(audio=audio_message)
1975
- except Exception as e:
1976
- print(f"Error in receive: {e}")
1977
- # Connection might be closed, ignore the error
1978
 
1979
  async def emit(self) -> tuple[int, np.ndarray] | AdditionalOutputs | None:
1980
- # In interpretation mode, we need to keep checking for audio
1981
- if self.interpretation_mode:
1982
- # Use a timeout to prevent blocking forever
1983
- try:
1984
- item = await asyncio.wait_for(wait_for_item(self.output_queue), timeout=0.1)
1985
- return item
1986
- except asyncio.TimeoutError:
1987
- return None
1988
- else:
1989
- # Normal mode
1990
- item = await wait_for_item(self.output_queue)
1991
-
1992
- # Check if it's a dict with text message
1993
- if isinstance(item, dict) and item.get('type') == 'text_message':
1994
- await self.process_text_message(item['content'])
1995
- return None
1996
-
1997
- return item
1998
 
1999
  async def shutdown(self) -> None:
2000
- if self.interpretation_mode:
2001
- # Clean up interpretation mode
2002
- self.audio_buffer = []
2003
- self.is_recording = False
2004
- print("[INTERPRETATION MODE] Shutdown complete")
2005
- else:
2006
- # Normal mode - close Realtime API connection
2007
- if self.connection:
2008
- await self.connection.close()
2009
- self.connection = None
2010
 
2011
 
2012
  # Create initial handler instance
2013
- handler = OpenAIHandler(web_search_enabled=False, interpretation_mode=False)
2014
 
2015
  # Create components
2016
  chatbot = gr.Chatbot(type="messages")
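The deleted receive() gated interpretation on a simple energy threshold: frames whose mean amplitude exceeded roughly 200 started a recording, and about 20 consecutive quiet frames flushed the buffer. A compact sketch of that pattern with the same thresholds; the class name EnergyGateVAD is illustrative:

import numpy as np


class EnergyGateVAD:
    """Energy-threshold voice activity detection in the style of the removed receive() logic."""

    def __init__(self, level_threshold: float = 200, silence_limit: int = 20, min_frames: int = 10):
        self.level_threshold = level_threshold   # mean |amplitude| that counts as speech
        self.silence_limit = silence_limit       # ~0.4 s of quiet frames ends the utterance
        self.min_frames = min_frames             # ignore blips shorter than this
        self.buffer: list[np.ndarray] = []
        self.is_recording = False
        self.silence_frames = 0

    def push(self, frame: np.ndarray) -> np.ndarray | None:
        """Feed one mono int16 frame; returns the buffered utterance once trailing silence is seen."""
        level = np.abs(frame).mean()
        if level > self.level_threshold:
            self.is_recording = True
            self.silence_frames = 0
            self.buffer.append(frame)
        elif self.is_recording:
            self.silence_frames += 1
            self.buffer.append(frame)
            if self.silence_frames > self.silence_limit and len(self.buffer) > self.min_frames:
                utterance = np.concatenate(self.buffer)
                self.buffer, self.is_recording, self.silence_frames = [], False, 0
                return utterance
        return None

In the removed handler, the flushed utterance was what process_interpretation() received, scheduled as a background task via asyncio.create_task.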
@@ -2043,12 +1614,10 @@ async def custom_offer(request: Request):
2043
  web_search_enabled = body.get("web_search_enabled", False)
2044
  target_language = body.get("target_language", "")
2045
  system_prompt = body.get("system_prompt", "")
2046
- interpretation_mode = body.get("interpretation_mode", False)
2047
- interpretation_language = body.get("interpretation_language", "")
2048
 
2049
- print(f"Custom offer - webrtc_id: {webrtc_id}, web_search_enabled: {web_search_enabled}, "
2050
- f"target_language: {target_language}, interpretation_mode: {interpretation_mode}, "
2051
- f"interpretation_language: {interpretation_language}")
2052
 
2053
  # Store settings with timestamp
2054
  if webrtc_id:
@@ -2056,10 +1625,11 @@ async def custom_offer(request: Request):
2056
  'web_search_enabled': web_search_enabled,
2057
  'target_language': target_language,
2058
  'system_prompt': system_prompt,
2059
- 'interpretation_mode': interpretation_mode,
2060
- 'interpretation_language': interpretation_language,
2061
  'timestamp': asyncio.get_event_loop().time()
2062
  }
 
 
 
2063
 
2064
  # Remove our custom route temporarily
2065
  custom_route = None
@@ -2069,12 +1639,15 @@ async def custom_offer(request: Request):
2069
  break
2070
 
2071
  # Forward to stream's offer handler
 
2072
  response = await stream.offer(body)
2073
 
2074
  # Re-add our custom route
2075
  if custom_route:
2076
  app.routes.insert(0, custom_route)
2077
 
 
 
2078
  return response
2079
 
2080
 
@@ -2130,13 +1703,12 @@ async def outputs(webrtc_id: str):
2130
  yield f"event: search\ndata: {json.dumps(output.args[0])}\n\n"
2131
  # Regular transcript event with language info
2132
  elif isinstance(output.args[0], dict) and 'event' in output.args[0]:
2133
- event = output.args[0]['event']
2134
- if hasattr(event, 'transcript'):
2135
  data = {
2136
  "role": "assistant",
2137
- "content": event.transcript,
2138
- "language": output.args[0].get('language', ''),
2139
- "mode": output.args[0].get('mode', 'normal')
2140
  }
2141
  yield f"event: output\ndata: {json.dumps(data)}\n\n"
2142
 
 
162
  gap: 15px;
163
  margin-bottom: 15px;
164
  }
165
  .setting-item {
166
  display: flex;
167
  align-items: center;
 
304
  padding: 10px;
305
  margin-bottom: 10px;
306
  }
307
+ .language-info {
308
+ font-size: 12px;
309
+ color: #888;
310
+ margin-left: 5px;
311
  }
312
  .controls {
313
  text-align: center;
 
495
  background-color: var(--secondary-color);
496
  border-radius: 50%;
497
  }
498
  </style>
499
  </head>
500
 
 
519
  <div class="main-content">
520
  <div class="sidebar">
521
  <div class="settings-section">
522
+ <h3 style="margin: 0 0 15px 0; color: var(--primary-color);">설정(텍스트 채팅에만 적용)</h3>
523
  <div class="settings-grid">
524
  <div class="setting-item">
525
  <span class="setting-label">웹 검색</span>
 
528
  </div>
529
  </div>
530
  <div class="setting-item">
531
+ <span class="setting-label">다국어 번역 채팅</span>
532
  <select id="language-select">
533
  <option value="">비활성화</option>
534
  <option value="ko">한국어 (Korean)</option>
 
563
  </select>
564
  </div>
565
  </div>
566
  <div class="text-input-section">
567
  <label for="system-prompt" class="setting-label">시스템 프롬프트:</label>
568
  <textarea id="system-prompt" placeholder="AI 어시스턴트의 성격, 역할, 행동 방식을 정의하세요...">You are a helpful assistant. Respond in a friendly and professional manner.</textarea>
 
595
  let webrtc_id;
596
  let webSearchEnabled = false;
597
  let selectedLanguage = "";
 
 
598
  let systemPrompt = "You are a helpful assistant. Respond in a friendly and professional manner.";
599
  const audioOutput = document.getElementById('audio-output');
600
  const startButton = document.getElementById('start-button');
 
604
  const statusText = document.getElementById('status-text');
605
  const searchToggle = document.getElementById('search-toggle');
606
  const languageSelect = document.getElementById('language-select');
607
  const systemPromptInput = document.getElementById('system-prompt');
608
  const textInput = document.getElementById('text-input');
609
  let audioLevel = 0;
 
625
  console.log('Selected language:', selectedLanguage);
626
  });
627
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
628
  // System prompt update
629
  systemPromptInput.addEventListener('input', () => {
630
  systemPrompt = systemPromptInput.value || "You are a helpful assistant. Respond in a friendly and professional manner.";
 
644
  const message = textInput.value.trim();
645
  if (!message) return;
646
647
  // Add user message to chat
648
  addMessage('user', message);
649
  textInput.value = '';
 
697
  statusDot.className = 'status-dot ' + state;
698
  if (state === 'connected') {
699
  statusText.textContent = '연결됨';
700
+ sendButton.style.display = 'block';
 
 
701
  isVoiceActive = true;
702
  } else if (state === 'connecting') {
703
  statusText.textContent = '연결 중...';
704
  sendButton.style.display = 'none';
705
  } else {
706
  statusText.textContent = '연결 대기 중';
707
+ sendButton.style.display = 'block'; // Show send button even when disconnected for text chat
 
 
708
  isVoiceActive = false;
709
  }
710
  }
 
850
  webrtc_id: webrtc_id,
851
  web_search_enabled: webSearchEnabled,
852
  target_language: selectedLanguage,
853
+ system_prompt: systemPrompt
 
 
854
  });
855
 
856
  const response = await fetch('/webrtc/offer', {
 
862
  webrtc_id: webrtc_id,
863
  web_search_enabled: webSearchEnabled,
864
  target_language: selectedLanguage,
865
+ system_prompt: systemPrompt
 
 
866
  })
867
  });
868
  const serverResponse = await response.json();
 
879
  const eventJson = JSON.parse(event.data);
880
  let content = eventJson.content;
881
882
  if (selectedLanguage && eventJson.language) {
883
  content += ` <span class="language-info">[${eventJson.language}]</span>`;
 
 
 
 
 
 
 
 
 
 
 
 
884
  }
885
  addMessage("assistant", content);
886
  });
 
901
  const messageDiv = document.createElement('div');
902
  messageDiv.classList.add('message', role);
903
904
  if (content.includes('<span')) {
905
  messageDiv.innerHTML = content;
906
  } else {
 
909
  chatMessages.appendChild(messageDiv);
910
  chatMessages.scrollTop = chatMessages.scrollHeight;
911
  }
912
+
913
  function stop() {
914
+ console.log('[STOP] Stopping connection...');
915
+
916
+ // Cancel animation frame first
917
  if (animationFrame) {
918
  cancelAnimationFrame(animationFrame);
919
+ animationFrame = null;
920
  }
921
+
922
+ // Close audio context
923
  if (audioContext) {
924
  audioContext.close();
925
  audioContext = null;
926
  analyser = null;
927
  audioSource = null;
928
  }
929
+
930
+ // Close data channel
931
+ if (dataChannel) {
932
+ dataChannel.close();
933
+ dataChannel = null;
934
+ }
935
+
936
+ // Close peer connection
937
  if (peerConnection) {
938
+ console.log('[STOP] Current connection state:', peerConnection.connectionState);
939
+
940
+ // Stop all transceivers
941
  if (peerConnection.getTransceivers) {
942
  peerConnection.getTransceivers().forEach(transceiver => {
943
  if (transceiver.stop) {
 
945
  }
946
  });
947
  }
948
+
949
+ // Stop all senders
950
  if (peerConnection.getSenders) {
951
  peerConnection.getSenders().forEach(sender => {
952
+ if (sender.track) {
953
+ sender.track.stop();
954
+ }
955
+ });
956
+ }
957
+
958
+ // Stop all receivers
959
+ if (peerConnection.getReceivers) {
960
+ peerConnection.getReceivers().forEach(receiver => {
961
+ if (receiver.track) {
962
+ receiver.track.stop();
963
+ }
964
  });
965
  }
966
+
967
+ // Close the connection
968
  peerConnection.close();
969
+
970
+ // Clear the reference
971
+ peerConnection = null;
972
+
973
+ console.log('[STOP] Connection closed');
974
  }
975
+
976
+ // Reset audio level
977
  audioLevel = 0;
978
+ isVoiceActive = false;
979
+
980
+ // Update UI
981
+ updateButtonState();
982
+
983
+ // Clear any existing webrtc_id
984
+ if (webrtc_id) {
985
+ console.log('[STOP] Clearing webrtc_id:', webrtc_id);
986
+ webrtc_id = null;
987
+ }
988
  }
989
  startButton.addEventListener('click', () => {
990
  console.log('clicked');
 
997
  }
998
  });
999
 
1000
+ // Initialize on page load
1001
  window.addEventListener('DOMContentLoaded', () => {
1002
  sendButton.style.display = 'block';
1003
  });
 
1075
  return chatbot
1076
 
1077
1078
  async def process_text_chat(message: str, web_search_enabled: bool, target_language: str,
1079
  system_prompt: str) -> Dict[str, str]:
1080
  """Process text chat using GPT-4o-mini model"""
 
1178
 
1179
  class OpenAIHandler(AsyncStreamHandler):
1180
  def __init__(self, web_search_enabled: bool = False, target_language: str = "",
1181
+ system_prompt: str = "", webrtc_id: str = None) -> None:
 
1182
  super().__init__(
1183
  expected_layout="mono",
1184
  output_sample_rate=SAMPLE_RATE,
 
1195
  self.web_search_enabled = web_search_enabled
1196
  self.target_language = target_language
1197
  self.system_prompt = system_prompt
 
 
1198
 
1199
+ print(f"[INIT] Handler created with web_search={web_search_enabled}, "
1200
+ f"target_language={target_language}")
1201
 
1202
  def copy(self):
1203
  # Get the most recent settings
 
1209
  if recent_ids:
1210
  recent_id = recent_ids[0]
1211
  settings = connection_settings[recent_id]
1212
+
1213
+ # Log the settings being copied
1214
+ print(f"[COPY] Copying settings from {recent_id}:")
1215
+
1216
  return OpenAIHandler(
1217
  web_search_enabled=settings.get('web_search_enabled', False),
1218
  target_language=settings.get('target_language', ''),
1219
  system_prompt=settings.get('system_prompt', ''),
1220
+ webrtc_id=recent_id
 
 
1221
  )
1222
 
1223
+ print(f"[COPY] No settings found, creating default handler")
1224
+ return OpenAIHandler(web_search_enabled=False)
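copy() and the /webrtc/offer route coordinate through the module-level connection_settings dict: the route stores each caller's options under its webrtc_id with a timestamp, and copy() falls back to the newest entry. A minimal sketch of that lookup, assuming the same dict shape; time.monotonic() stands in here for the event-loop clock the app uses, and both helper names are illustrative:

import time

connection_settings: dict[str, dict] = {}


def store_settings(webrtc_id: str, **options) -> None:
    """Record a connection's options the way the offer route does."""
    connection_settings[webrtc_id] = {**options, "timestamp": time.monotonic()}


def most_recent_settings() -> dict:
    """Return the newest settings entry, or an empty dict if nothing has connected yet."""
    if not connection_settings:
        return {}
    newest = max(connection_settings, key=lambda k: connection_settings[k].get("timestamp", 0))
    return connection_settings[newest]

# e.g. store_settings("abc123", web_search_enabled=True, target_language="ko", system_prompt="...")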
1225
 
1226
  async def search_web(self, query: str) -> str:
1227
  """Perform web search and return formatted results"""
 
1256
  )
1257
  await self.connection.response.create()
1258
1259
  def get_translation_instructions(self):
1260
  """Get instructions for translation based on target language"""
1261
+ if not self.target_language:
1262
  return ""
1263
 
1264
  language_name = SUPPORTED_LANGUAGES.get(self.target_language, self.target_language)
 
1269
  )
1270
 
1271
  async def start_up(self):
1272
+ """Connect to realtime API"""
1273
  # First check if we have the most recent settings
1274
+ if connection_settings and self.webrtc_id:
1275
+ if self.webrtc_id in connection_settings:
1276
+ settings = connection_settings[self.webrtc_id]
 
 
 
 
1277
  self.web_search_enabled = settings.get('web_search_enabled', False)
1278
  self.target_language = settings.get('target_language', '')
1279
  self.system_prompt = settings.get('system_prompt', '')
1280
+
1281
+ print(f"[START_UP] Updated settings from storage for {self.webrtc_id}")
1282
 
1283
+ print(f"[START_UP] Starting normal mode")
 
 
1284
 
1285
  self.client = openai.AsyncOpenAI()
1286
 
 
 
 
 
 
 
 
 
 
1287
  # Normal mode - connect to Realtime API
1288
+ print(f"[NORMAL MODE] Connecting to Realtime API...")
1289
+
1290
  # Define the web search function
1291
  tools = []
1292
  base_instructions = self.system_prompt or "You are a helpful assistant."
 
1500
  ),
1501
  )
1502
 
1503
+ # Handle function calls
1504
+ elif event.type == "response.function_call_arguments.start":
1505
  print(f"Function call started")
1506
  self.function_call_in_progress = True
1507
  self.current_function_args = ""
1508
  self.current_call_id = getattr(event, 'call_id', None)
1509
 
1510
+ elif event.type == "response.function_call_arguments.delta":
1511
  if self.function_call_in_progress:
1512
  self.current_function_args += event.delta
1513
 
1514
+ elif event.type == "response.function_call_arguments.done":
1515
  if self.function_call_in_progress:
1516
  print(f"Function call done, args: {self.current_function_args}")
1517
  try:
 
1547
  self.current_call_id = None
1548
 
1549
  async def receive(self, frame: tuple[int, np.ndarray]) -> None:
1550
+ # Normal mode - use Realtime API
1551
+ if not self.connection:
1552
+ print(f"[RECEIVE] No connection in normal mode, skipping")
1553
+ return
1554
+ try:
1555
  _, array = frame
1556
  array = array.squeeze()
1557
+ audio_message = base64.b64encode(array.tobytes()).decode("utf-8")
1558
+ await self.connection.input_audio_buffer.append(audio=audio_message)
1559
+ except Exception as e:
1560
+ print(f"Error in receive: {e}")
1561
 
1562
  async def emit(self) -> tuple[int, np.ndarray] | AdditionalOutputs | None:
1563
+ # Normal mode
1564
+ item = await wait_for_item(self.output_queue)
1565
+
1566
+ # Check if it's a dict with text message
1567
+ if isinstance(item, dict) and item.get('type') == 'text_message':
1568
+ await self.process_text_message(item['content'])
1569
+ return None
1570
+
1571
+ return item
 
 
 
 
 
 
 
 
 
1572
 
1573
  async def shutdown(self) -> None:
1574
+ print(f"[SHUTDOWN] Called")
1575
+
1576
+ # Normal mode - close Realtime API connection
1577
+ if self.connection:
1578
+ await self.connection.close()
1579
+ self.connection = None
1580
+ print("[NORMAL MODE] Connection closed")
 
 
 
1581
 
1582
 
1583
  # Create initial handler instance
1584
+ handler = OpenAIHandler(web_search_enabled=False)
1585
 
1586
  # Create components
1587
  chatbot = gr.Chatbot(type="messages")
 
1614
  web_search_enabled = body.get("web_search_enabled", False)
1615
  target_language = body.get("target_language", "")
1616
  system_prompt = body.get("system_prompt", "")
 
 
1617
 
1618
+ print(f"[OFFER] Received offer with webrtc_id: {webrtc_id}")
1619
+ print(f"[OFFER] web_search_enabled: {web_search_enabled}")
1620
+ print(f"[OFFER] target_language: {target_language}")
1621
 
1622
  # Store settings with timestamp
1623
  if webrtc_id:
 
1625
  'web_search_enabled': web_search_enabled,
1626
  'target_language': target_language,
1627
  'system_prompt': system_prompt,
 
 
1628
  'timestamp': asyncio.get_event_loop().time()
1629
  }
1630
+
1631
+ print(f"[OFFER] Stored settings for {webrtc_id}:")
1632
+ print(f"[OFFER] {connection_settings[webrtc_id]}")
1633
 
1634
  # Remove our custom route temporarily
1635
  custom_route = None
 
1639
  break
1640
 
1641
  # Forward to stream's offer handler
1642
+ print(f"[OFFER] Forwarding to stream.offer()")
1643
  response = await stream.offer(body)
1644
 
1645
  # Re-add our custom route
1646
  if custom_route:
1647
  app.routes.insert(0, custom_route)
1648
 
1649
+ print(f"[OFFER] Response status: {response.get('status', 'unknown') if isinstance(response, dict) else 'OK'}")
1650
+
1651
  return response
1652
 
1653
 
 
1703
  yield f"event: search\ndata: {json.dumps(output.args[0])}\n\n"
1704
  # Regular transcript event with language info
1705
  elif isinstance(output.args[0], dict) and 'event' in output.args[0]:
1706
+ event_data = output.args[0]
1707
+ if 'event' in event_data and hasattr(event_data['event'], 'transcript'):
1708
  data = {
1709
  "role": "assistant",
1710
+ "content": event_data['event'].transcript,
1711
+ "language": event_data.get('language', '')
 
1712
  }
1713
  yield f"event: output\ndata: {json.dumps(data)}\n\n"
1714
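Each assistant transcript leaves the /outputs route as a server-sent event whose data field is the JSON payload built above. A minimal sketch of that framing using only the standard library; the sample transcript is a placeholder:

import json


def sse_output_event(transcript: str, language: str = "") -> str:
    """Frame one assistant transcript as an 'output' server-sent event."""
    data = {"role": "assistant", "content": transcript, "language": language}
    return f"event: output\ndata: {json.dumps(data)}\n\n"


# e.g. sse_output_event("Hello!", "English")
# -> 'event: output\ndata: {"role": "assistant", "content": "Hello!", "language": "English"}\n\n'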