seawolf2357 committed · Commit 78b5e61 · verified · Parent: 753be88

Update app.py

Files changed (1):
  1. app.py (+306, -233)

app.py CHANGED
@@ -861,8 +861,8 @@ HTML_CONTENT = """<!DOCTYPE html>
     console.log('Selected language:', selectedLanguage);
 });
 
-// Interpretation mode toggle
-interpretationToggle.addEventListener('click', () => {
+// Interpretation mode toggle - 수정된 버전
+interpretationToggle.addEventListener('click', async () => {
     if (!interpretationMode) {
         // Turning ON interpretation mode
         interpretationLanguagesContainer.style.display = 'block';
@@ -891,15 +891,22 @@ HTML_CONTENT = """<!DOCTYPE html>
         textInput.placeholder = '통역 모드에서는 텍스트 입력이 지원되지 않습니다';
         sendButton.style.display = 'none';
 
-        console.log('Interpretation mode enabled with languages:', interpretationLanguages);
+        console.log('[FRONTEND] Interpretation mode enabled with languages:', interpretationLanguages);
 
-        // If connected, restart to apply interpretation mode
+        // If connected, stop and restart with interpretation mode
         if (peerConnection && peerConnection.connectionState === 'connected') {
+            console.log('[FRONTEND] Stopping current connection for interpretation mode');
             showError('통역 모드 설정을 적용하기 위해 연결을 다시 시작합니다.');
+
+            // Force stop the connection
             stop();
-            setTimeout(() => {
-                setupWebRTC();
-            }, 500);
+
+            // Wait a bit longer to ensure cleanup
+            await new Promise(resolve => setTimeout(resolve, 1000));
+
+            // Start new connection with interpretation mode
+            console.log('[FRONTEND] Starting new connection with interpretation mode');
+            setupWebRTC();
         }
     } else {
         // Turning OFF interpretation mode
@@ -916,19 +923,26 @@ HTML_CONTENT = """<!DOCTYPE html>
         textInput.placeholder = '텍스트 메시지를 입력하세요...';
         sendButton.style.display = 'block';
 
-        console.log('Interpretation mode disabled');
+        console.log('[FRONTEND] Interpretation mode disabled');
 
         // If connected, restart to apply normal mode
         if (peerConnection && peerConnection.connectionState === 'connected') {
+            console.log('[FRONTEND] Stopping current connection for normal mode');
             showError('일반 모드로 전환하기 위해 연결을 다시 시작합니다.');
+
+            // Force stop the connection
             stop();
-            setTimeout(() => {
-                setupWebRTC();
-            }, 500);
+
+            // Wait a bit longer to ensure cleanup
+            await new Promise(resolve => setTimeout(resolve, 1000));
+
+            // Start new connection with normal mode
+            console.log('[FRONTEND] Starting new connection with normal mode');
+            setupWebRTC();
         }
     }
 
-    console.log('Interpretation mode:', interpretationMode);
+    console.log('[FRONTEND] Final interpretation mode:', interpretationMode);
 });
 
 // System prompt update
@@ -1284,16 +1298,33 @@ HTML_CONTENT = """<!DOCTYPE html>
 }
 
 function stop() {
+    console.log('[STOP] Stopping connection...');
+
+    // Cancel animation frame first
     if (animationFrame) {
         cancelAnimationFrame(animationFrame);
+        animationFrame = null;
     }
+
+    // Close audio context
     if (audioContext) {
         audioContext.close();
         audioContext = null;
         analyser = null;
         audioSource = null;
     }
+
+    // Close data channel
+    if (dataChannel) {
+        dataChannel.close();
+        dataChannel = null;
+    }
+
+    // Close peer connection
     if (peerConnection) {
+        console.log('[STOP] Current connection state:', peerConnection.connectionState);
+
+        // Stop all transceivers
         if (peerConnection.getTransceivers) {
             peerConnection.getTransceivers().forEach(transceiver => {
                 if (transceiver.stop) {
@@ -1301,17 +1332,46 @@ HTML_CONTENT = """<!DOCTYPE html>
                 }
             });
         }
+
+        // Stop all senders
         if (peerConnection.getSenders) {
             peerConnection.getSenders().forEach(sender => {
-                if (sender.track && sender.track.stop) sender.track.stop();
+                if (sender.track) {
+                    sender.track.stop();
+                }
+            });
+        }
+
+        // Stop all receivers
+        if (peerConnection.getReceivers) {
+            peerConnection.getReceivers().forEach(receiver => {
+                if (receiver.track) {
+                    receiver.track.stop();
+                }
             });
         }
-        console.log('closing');
+
+        // Close the connection
         peerConnection.close();
+
+        // Clear the reference
+        peerConnection = null;
+
+        console.log('[STOP] Connection closed');
     }
-    dataChannel = null;
-    updateButtonState();
+
+    // Reset audio level
     audioLevel = 0;
+    isVoiceActive = false;
+
+    // Update UI
+    updateButtonState();
+
+    // Clear any existing webrtc_id
+    if (webrtc_id) {
+        console.log('[STOP] Clearing webrtc_id:', webrtc_id);
+        webrtc_id = null;
+    }
 }
 startButton.addEventListener('click', () => {
     console.log('clicked');
@@ -1534,9 +1594,8 @@ class OpenAIHandler(AsyncStreamHandler):
         self.silence_threshold = 20 # Reduced for faster response (20 frames = ~0.4 seconds)
         self.min_audio_length = 10 # Minimum frames to consider as speech
 
-        print(f"Handler created with web_search_enabled={web_search_enabled}, "
-              f"target_language={target_language}, webrtc_id={webrtc_id}, "
-              f"interpretation_mode={interpretation_mode}, interpretation_languages={interpretation_languages}")
+        print(f"[INIT] Handler created with interpretation_mode={interpretation_mode}, "
+              f"interpretation_languages={interpretation_languages}")
 
     def copy(self):
         # Get the most recent settings
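The thresholds in the context above (`silence_threshold`, `min_audio_length`) are frame counts, not seconds. The buffering that consumes them is outside this hunk, so the following is only a hypothetical sketch of how such counters typically close a speech segment; `SegmentBuffer` and `push` are illustrative names, not code from app.py.

```python
import numpy as np

class SegmentBuffer:
    """Hypothetical frame buffer: a run of silent frames closes a segment."""

    def __init__(self, silence_threshold: int = 20, min_audio_length: int = 10):
        self.silence_threshold = silence_threshold  # ~0.4 s of silence at 20 frames
        self.min_audio_length = min_audio_length    # shorter segments are discarded
        self.frames: list[np.ndarray] = []
        self.silent_run = 0

    def push(self, frame: np.ndarray, is_silent: bool) -> np.ndarray | None:
        """Buffer one frame; return a finished segment when silence closes it."""
        self.frames.append(frame)
        self.silent_run = self.silent_run + 1 if is_silent else 0
        if self.silent_run >= self.silence_threshold:
            segment = self.frames[:-self.silent_run]  # drop the trailing silence
            self.frames, self.silent_run = [], 0
            if len(segment) >= self.min_audio_length:
                return np.concatenate(segment)
        return None
```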
@@ -1548,6 +1607,12 @@ class OpenAIHandler(AsyncStreamHandler):
             if recent_ids:
                 recent_id = recent_ids[0]
                 settings = connection_settings[recent_id]
+
+                # Log the settings being copied
+                print(f"[COPY] Copying settings from {recent_id}:")
+                print(f"[COPY]   interpretation_mode={settings.get('interpretation_mode', False)}")
+                print(f"[COPY]   interpretation_languages={settings.get('interpretation_languages', [])}")
+
                 return OpenAIHandler(
                     web_search_enabled=settings.get('web_search_enabled', False),
                     target_language=settings.get('target_language', ''),
@@ -1557,7 +1622,7 @@ class OpenAIHandler(AsyncStreamHandler):
                     interpretation_languages=settings.get('interpretation_languages', [])
                 )
 
-        print(f"Handler.copy() called - creating new handler with default settings")
+        print(f"[COPY] No settings found, creating default handler")
         return OpenAIHandler(web_search_enabled=False, interpretation_mode=False)
 
     async def search_web(self, query: str) -> str:
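For reference, `copy()` still resolves "the most recent settings" by sorting on the stored `timestamp`, as the context lines above show. A minimal, self-contained sketch of that lookup with a toy `connection_settings` store:

```python
# Toy store; in app.py this is populated by the offer route.
connection_settings = {
    "old-id": {"interpretation_mode": False, "timestamp": 100.0},
    "new-id": {"interpretation_mode": True, "timestamp": 200.0},
}

# Newest timestamp first, exactly as copy() sorts it.
recent_ids = sorted(connection_settings.keys(),
                    key=lambda k: connection_settings[k].get('timestamp', 0),
                    reverse=True)
settings = connection_settings[recent_ids[0]]
assert settings["interpretation_mode"] is True  # "new-id" wins
```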
@@ -1772,52 +1837,47 @@ Direct translation to {target_lang_name}:"""
     async def start_up(self):
         """Connect to realtime API or setup interpretation mode"""
         # First check if we have the most recent settings
-        if connection_settings:
-            recent_ids = sorted(connection_settings.keys(),
-                                key=lambda k: connection_settings[k].get('timestamp', 0),
-                                reverse=True)
-            if recent_ids:
-                recent_id = recent_ids[0]
-                settings = connection_settings[recent_id]
+        if connection_settings and self.webrtc_id:
+            if self.webrtc_id in connection_settings:
+                settings = connection_settings[self.webrtc_id]
                 self.web_search_enabled = settings.get('web_search_enabled', False)
                 self.target_language = settings.get('target_language', '')
                 self.system_prompt = settings.get('system_prompt', '')
                 self.interpretation_mode = settings.get('interpretation_mode', False)
                 self.interpretation_languages = settings.get('interpretation_languages', [])
-                self.webrtc_id = recent_id
-                print(f"start_up: Updated settings from storage - webrtc_id={self.webrtc_id}, "
-                      f"web_search_enabled={self.web_search_enabled}, target_language={self.target_language}, "
-                      f"interpretation_mode={self.interpretation_mode}")
-        print(f"Handler interpretation settings: mode={self.interpretation_mode}, languages={self.interpretation_languages}")
+
+                print(f"[START_UP] Updated settings from storage for {self.webrtc_id}")
+                print(f"[START_UP] interpretation_mode={self.interpretation_mode}")
+                print(f"[START_UP] interpretation_languages={self.interpretation_languages}")
 
-        print(f"Starting up handler with web_search_enabled={self.web_search_enabled}, "
-              f"target_language={self.target_language}, interpretation_mode={self.interpretation_mode}, "
-              f"interpretation_languages={self.interpretation_languages}")
+        print(f"[START_UP] Starting with interpretation_mode={self.interpretation_mode}")
 
         self.client = openai.AsyncOpenAI()
 
         # If in interpretation mode, don't connect to Realtime API
         if self.interpretation_mode:
-            print(f"[INTERPRETATION MODE] Active - using Whisper + GPT-4o-mini (text only)")
+            print(f"[INTERPRETATION MODE] Active - Skipping Realtime API connection")
+            print(f"[INTERPRETATION MODE] Using Whisper + GPT-4o-mini (text only)")
             print(f"[INTERPRETATION MODE] Target languages: {self.interpretation_languages}")
+
+            # Do NOT connect to Realtime API
             # Just keep the handler ready to process audio
-            # Keep the emit loop running for interpretation results
-            asyncio.create_task(self._interpretation_loop())
             return
 
         # Normal mode - connect to Realtime API
-        if not self.interpretation_mode:
-            # Define the web search function
-            tools = []
-            base_instructions = self.system_prompt or "You are a helpful assistant."
+        print(f"[NORMAL MODE] Connecting to Realtime API...")
+
+        # Define the web search function
+        tools = []
+        base_instructions = self.system_prompt or "You are a helpful assistant."
+
+        # Add translation instructions if language is selected
+        if self.target_language:
+            language_name = SUPPORTED_LANGUAGES.get(self.target_language, self.target_language)
 
-            # Add translation instructions if language is selected
-            if self.target_language:
-                language_name = SUPPORTED_LANGUAGES.get(self.target_language, self.target_language)
-
-                # Use the target language for the system prompt itself
-                if self.target_language == "en":
-                    translation_instructions = """
+            # Use the target language for the system prompt itself
+            if self.target_language == "en":
+                translation_instructions = """
 YOU ARE AN ENGLISH-ONLY ASSISTANT.
 
 ABSOLUTE RULES:
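The substantive fix in the `start_up()` hunk above: instead of adopting whichever settings entry happens to be newest, the handler now reads only the entry stored under its own `webrtc_id`, so concurrent connections cannot pick up each other's settings. A sketch of the new lookup under the same names (`resolve_settings` is an illustrative helper, not in the commit):

```python
def resolve_settings(connection_settings: dict, webrtc_id: str | None) -> dict | None:
    """New behaviour: only this connection's own entry is used."""
    if connection_settings and webrtc_id and webrtc_id in connection_settings:
        return connection_settings[webrtc_id]
    return None  # fall back to the handler's constructor defaults
```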
@@ -1831,11 +1891,11 @@ YOUR LANGUAGE MODE: ENGLISH ONLY
 DO NOT USE: 안녕하세요, 감사합니다, or any Korean
 ALWAYS USE: Hello, Thank you, and English words only
 """
-                    # Override base instructions to be in English
-                    base_instructions = "You are a helpful assistant that speaks ONLY English."
-
-                elif self.target_language == "ja":
-                    translation_instructions = """
+                # Override base instructions to be in English
+                base_instructions = "You are a helpful assistant that speaks ONLY English."
+
+            elif self.target_language == "ja":
+                translation_instructions = """
 あなたは日本語のみを話すアシスタントです。
 
 絶対的なルール:
@@ -1849,10 +1909,10 @@ ALWAYS USE: Hello, Thank you, and English words only
 使用禁止:안녕하세요、감사합니다、韓国語全般
 必ず使用:こんにちは、ありがとうございます、日本語のみ
 """
-                    base_instructions = "あなたは日本語のみを話す親切なアシスタントです。"
-
-                elif self.target_language == "zh":
-                    translation_instructions = """
+                base_instructions = "あなたは日本語のみを話す親切なアシスタントです。"
+
+            elif self.target_language == "zh":
+                translation_instructions = """
 你是一个只说中文的助手。
 
 绝对规则:
@@ -1866,10 +1926,10 @@ ALWAYS USE: Hello, Thank you, and English words only
 禁止使用:안녕하세요、감사합니다、任何韩语
 必须使用:你好、谢谢、只用中文
 """
-                    base_instructions = "你是一个只说中文的友好助手。"
-
-                elif self.target_language == "es":
-                    translation_instructions = """
+                base_instructions = "你是一个只说中文的友好助手。"
+
+            elif self.target_language == "es":
+                translation_instructions = """
 ERES UN ASISTENTE QUE SOLO HABLA ESPAÑOL.
 
 REGLAS ABSOLUTAS:
@@ -1883,9 +1943,9 @@ MODO DE IDIOMA: SOLO ESPAÑOL
 NO USAR: 안녕하세요, 감사합니다, o cualquier coreano
 SIEMPRE USAR: Hola, Gracias, y solo palabras en español
 """
-                    base_instructions = "Eres un asistente útil que habla SOLO español."
-                else:
-                    translation_instructions = f"""
+                base_instructions = "Eres un asistente útil que habla SOLO español."
+            else:
+                translation_instructions = f"""
 YOU MUST ONLY SPEAK {language_name.upper()}.
 
 RULES:
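The `if`/`elif` ladder above pairs each target language with a strictly monolingual base prompt. Condensed into a table for readability; this is illustrative only — the commit keeps the explicit branches, and the full prompts are the multi-line string blocks shown above:

```python
# All strings below appear verbatim in the diff; only the table form is new.
BASE_PROMPTS = {
    "en": "You are a helpful assistant that speaks ONLY English.",
    "ja": "あなたは日本語のみを話す親切なアシスタントです。",
    "zh": "你是一个只说中文的友好助手。",
    "es": "Eres un asistente útil que habla SOLO español.",
}

def base_for(target_language: str, language_name: str) -> str:
    # Unlisted languages fall through to the generic f-string branch.
    return BASE_PROMPTS.get(
        target_language,
        f"You are a helpful assistant that speaks ONLY {language_name}.",
    )
```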
@@ -1893,181 +1953,182 @@ RULES:
 2. Never use Korean
 3. Always respond in {language_name}
 """
-                    base_instructions = f"You are a helpful assistant that speaks ONLY {language_name}."
-            else:
-                translation_instructions = ""
-
-            if self.web_search_enabled and self.search_client:
-                tools = [{
-                    "type": "function",
-                    "function": {
-                        "name": "web_search",
-                        "description": "Search the web for current information. Use this for weather, news, prices, current events, or any time-sensitive topics.",
-                        "parameters": {
-                            "type": "object",
-                            "properties": {
-                                "query": {
-                                    "type": "string",
-                                    "description": "The search query"
-                                }
-                            },
-                            "required": ["query"]
-                        }
-                    }
-                }]
-                print("Web search function added to tools")
-
-                search_instructions = (
-                    "\n\nYou have web search capabilities. "
-                    "IMPORTANT: You MUST use the web_search function for ANY of these topics:\n"
-                    "- Weather (날씨, 기온, 비, 눈)\n"
-                    "- News (뉴스, 소식)\n"
-                    "- Current events (현재, 최근, 오늘, 지금)\n"
-                    "- Prices (가격, 환율, 주가)\n"
-                    "- Sports scores or results\n"
-                    "- Any question about 2024 or 2025\n"
-                    "- Any time-sensitive information\n\n"
-                    "When in doubt, USE web_search. It's better to search and provide accurate information "
-                    "than to guess or use outdated information."
-                )
-
-                # Combine all instructions
-                if translation_instructions:
-                    # Translation instructions already include base_instructions
-                    instructions = translation_instructions + search_instructions
-                else:
-                    instructions = base_instructions + search_instructions
-            else:
-                # No web search
-                if translation_instructions:
-                    instructions = translation_instructions
-                else:
-                    instructions = base_instructions
-
-            print(f"[NORMAL MODE] Base instructions: {base_instructions[:100]}...")
-            print(f"[NORMAL MODE] Translation instructions: {translation_instructions[:200] if translation_instructions else 'None'}...")
-            print(f"[NORMAL MODE] Combined instructions length: {len(instructions)}")
-            print(f"[NORMAL MODE] Target language: {self.target_language}")
-
-            async with self.client.beta.realtime.connect(
-                model="gpt-4o-mini-realtime-preview-2024-12-17"
-            ) as conn:
-                # Update session with tools
-                session_update = {
-                    "turn_detection": {"type": "server_vad"},
-                    "instructions": instructions,
-                    "tools": tools,
-                    "tool_choice": "auto" if tools else "none",
-                    "temperature": 0.7,
-                    "max_response_output_tokens": 4096,
-                    "modalities": ["text", "audio"],
-                    "voice": "alloy" # Default voice
-                }
-
-                # Use appropriate voice for the language
-                if self.target_language:
-                    # Force language through multiple mechanisms
-                    # 1. Use voice that's known to work well with the language
-                    voice_map = {
-                        "en": "nova", # Nova has clearer English
-                        "es": "nova", # Nova works for Spanish
-                        "fr": "shimmer", # Shimmer for French
-                        "de": "echo", # Echo for German
-                        "ja": "alloy", # Alloy can do Japanese
-                        "zh": "alloy", # Alloy can do Chinese
-                        "ko": "nova", # Nova for Korean
-                    }
-                    session_update["voice"] = voice_map.get(self.target_language, "nova")
-
-                    # 2. Add language to modalities (experimental)
-                    session_update["modalities"] = ["text", "audio"]
-
-                    # 3. Set output format
-                    session_update["output_audio_format"] = "pcm16"
-
-                    # 4. Add language hint to the system (if supported by API)
-                    if self.target_language in ["en", "es", "fr", "de", "ja", "zh"]:
-                        session_update["language"] = self.target_language # Try setting language directly
-
-                    print(f"[TRANSLATION MODE] Session update: {json.dumps(session_update, indent=2)}")
-
-                await conn.session.update(session=session_update)
-                self.connection = conn
-                print(f"Connected with tools: {len(tools)} functions, voice: {session_update.get('voice', 'default')}")
-
-                async for event in self.connection:
-                    # Debug logging for function calls
-                    if event.type.startswith("response.function_call"):
-                        print(f"Function event: {event.type}")
-
-                    if event.type == "response.audio_transcript.done":
-                        print(f"[RESPONSE] Transcript: {event.transcript[:100]}...")
-                        print(f"[RESPONSE] Expected language: {self.target_language}")
-
-                        output_data = {
-                            "event": event,
-                            "language": SUPPORTED_LANGUAGES.get(self.target_language, "") if self.target_language else ""
-                        }
-                        await self.output_queue.put(AdditionalOutputs(output_data))
-
-                    elif event.type == "response.audio.delta":
-                        await self.output_queue.put(
-                            (
-                                self.output_sample_rate,
-                                np.frombuffer(
-                                    base64.b64decode(event.delta), dtype=np.int16
-                                ).reshape(1, -1),
-                            ),
-                        )
-
-                    # Handle function calls (only in non-interpretation mode)
-                    elif event.type == "response.function_call_arguments.start" and not self.interpretation_mode:
-                        print(f"Function call started")
-                        self.function_call_in_progress = True
-                        self.current_function_args = ""
-                        self.current_call_id = getattr(event, 'call_id', None)
-
-                    elif event.type == "response.function_call_arguments.delta" and not self.interpretation_mode:
-                        if self.function_call_in_progress:
-                            self.current_function_args += event.delta
-
-                    elif event.type == "response.function_call_arguments.done" and not self.interpretation_mode:
-                        if self.function_call_in_progress:
-                            print(f"Function call done, args: {self.current_function_args}")
-                            try:
-                                args = json.loads(self.current_function_args)
-                                query = args.get("query", "")
-
-                                # Emit search event to client
-                                await self.output_queue.put(AdditionalOutputs({
-                                    "type": "search",
-                                    "query": query
-                                }))
-
-                                # Perform the search
-                                search_results = await self.search_web(query)
-                                print(f"Search results length: {len(search_results)}")
-
-                                # Send function result back to the model
-                                if self.connection and self.current_call_id:
-                                    await self.connection.conversation.item.create(
-                                        item={
-                                            "type": "function_call_output",
-                                            "call_id": self.current_call_id,
-                                            "output": search_results
-                                        }
-                                    )
-                                    await self.connection.response.create()
-
-                            except Exception as e:
-                                print(f"Function call error: {e}")
-                            finally:
-                                self.function_call_in_progress = False
-                                self.current_function_args = ""
-                                self.current_call_id = None
+                base_instructions = f"You are a helpful assistant that speaks ONLY {language_name}."
+        else:
+            translation_instructions = ""
+
+        if self.web_search_enabled and self.search_client:
+            tools = [{
+                "type": "function",
+                "function": {
+                    "name": "web_search",
+                    "description": "Search the web for current information. Use this for weather, news, prices, current events, or any time-sensitive topics.",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "query": {
+                                "type": "string",
+                                "description": "The search query"
+                            }
+                        },
+                        "required": ["query"]
+                    }
+                }
+            }]
+            print("Web search function added to tools")
+
+            search_instructions = (
+                "\n\nYou have web search capabilities. "
+                "IMPORTANT: You MUST use the web_search function for ANY of these topics:\n"
+                "- Weather (날씨, 기온, 비, 눈)\n"
+                "- News (뉴스, 소식)\n"
+                "- Current events (현재, 최근, 오늘, 지금)\n"
+                "- Prices (가격, 환율, 주가)\n"
+                "- Sports scores or results\n"
+                "- Any question about 2024 or 2025\n"
+                "- Any time-sensitive information\n\n"
+                "When in doubt, USE web_search. It's better to search and provide accurate information "
+                "than to guess or use outdated information."
+            )
+
+            # Combine all instructions
+            if translation_instructions:
+                # Translation instructions already include base_instructions
+                instructions = translation_instructions + search_instructions
+            else:
+                instructions = base_instructions + search_instructions
+        else:
+            # No web search
+            if translation_instructions:
+                instructions = translation_instructions
+            else:
+                instructions = base_instructions
+
+        print(f"[NORMAL MODE] Base instructions: {base_instructions[:100]}...")
+        print(f"[NORMAL MODE] Translation instructions: {translation_instructions[:200] if translation_instructions else 'None'}...")
+        print(f"[NORMAL MODE] Combined instructions length: {len(instructions)}")
+        print(f"[NORMAL MODE] Target language: {self.target_language}")
+
+        async with self.client.beta.realtime.connect(
+            model="gpt-4o-mini-realtime-preview-2024-12-17"
+        ) as conn:
+            # Update session with tools
+            session_update = {
+                "turn_detection": {"type": "server_vad"},
+                "instructions": instructions,
+                "tools": tools,
+                "tool_choice": "auto" if tools else "none",
+                "temperature": 0.7,
+                "max_response_output_tokens": 4096,
+                "modalities": ["text", "audio"],
+                "voice": "alloy" # Default voice
+            }
+
+            # Use appropriate voice for the language
+            if self.target_language:
+                # Force language through multiple mechanisms
+                # 1. Use voice that's known to work well with the language
+                voice_map = {
+                    "en": "nova", # Nova has clearer English
+                    "es": "nova", # Nova works for Spanish
+                    "fr": "shimmer", # Shimmer for French
+                    "de": "echo", # Echo for German
+                    "ja": "alloy", # Alloy can do Japanese
+                    "zh": "alloy", # Alloy can do Chinese
+                    "ko": "nova", # Nova for Korean
+                }
+                session_update["voice"] = voice_map.get(self.target_language, "nova")
+
+                # 2. Add language to modalities (experimental)
+                session_update["modalities"] = ["text", "audio"]
+
+                # 3. Set output format
+                session_update["output_audio_format"] = "pcm16"
+
+                # 4. Add language hint to the system (if supported by API)
+                if self.target_language in ["en", "es", "fr", "de", "ja", "zh"]:
+                    session_update["language"] = self.target_language # Try setting language directly
+
+                print(f"[TRANSLATION MODE] Session update: {json.dumps(session_update, indent=2)}")
+
+            await conn.session.update(session=session_update)
+            self.connection = conn
+            print(f"Connected with tools: {len(tools)} functions, voice: {session_update.get('voice', 'default')}")
+
+            async for event in self.connection:
+                # Debug logging for function calls
+                if event.type.startswith("response.function_call"):
+                    print(f"Function event: {event.type}")
+
+                if event.type == "response.audio_transcript.done":
+                    print(f"[RESPONSE] Transcript: {event.transcript[:100]}...")
+                    print(f"[RESPONSE] Expected language: {self.target_language}")
+
+                    output_data = {
+                        "event": event,
+                        "language": SUPPORTED_LANGUAGES.get(self.target_language, "") if self.target_language else ""
+                    }
+                    await self.output_queue.put(AdditionalOutputs(output_data))
+
+                elif event.type == "response.audio.delta":
+                    await self.output_queue.put(
+                        (
+                            self.output_sample_rate,
+                            np.frombuffer(
+                                base64.b64decode(event.delta), dtype=np.int16
+                            ).reshape(1, -1),
+                        ),
+                    )
+
+                # Handle function calls (only in non-interpretation mode)
+                elif event.type == "response.function_call_arguments.start" and not self.interpretation_mode:
+                    print(f"Function call started")
+                    self.function_call_in_progress = True
+                    self.current_function_args = ""
+                    self.current_call_id = getattr(event, 'call_id', None)
+
+                elif event.type == "response.function_call_arguments.delta" and not self.interpretation_mode:
+                    if self.function_call_in_progress:
+                        self.current_function_args += event.delta
+
+                elif event.type == "response.function_call_arguments.done" and not self.interpretation_mode:
+                    if self.function_call_in_progress:
+                        print(f"Function call done, args: {self.current_function_args}")
+                        try:
+                            args = json.loads(self.current_function_args)
+                            query = args.get("query", "")
+
+                            # Emit search event to client
+                            await self.output_queue.put(AdditionalOutputs({
+                                "type": "search",
+                                "query": query
+                            }))
+
+                            # Perform the search
+                            search_results = await self.search_web(query)
+                            print(f"Search results length: {len(search_results)}")
+
+                            # Send function result back to the model
+                            if self.connection and self.current_call_id:
+                                await self.connection.conversation.item.create(
+                                    item={
+                                        "type": "function_call_output",
+                                        "call_id": self.current_call_id,
+                                        "output": search_results
+                                    }
+                                )
+                                await self.connection.response.create()
+
+                        except Exception as e:
+                            print(f"Function call error: {e}")
+                        finally:
+                            self.function_call_in_progress = False
+                            self.current_function_args = ""
+                            self.current_call_id = None
 
     async def receive(self, frame: tuple[int, np.ndarray]) -> None:
         if self.interpretation_mode:
+            print(f"[RECEIVE] Processing in interpretation mode")
             # In interpretation mode, buffer audio and process with Whisper
             _, array = frame
             array = array.squeeze()
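A condensed view of the tool-call round-trip that the `async for event` loop above implements: the streamed `function_call_arguments.delta` chunks are concatenated, the search is executed locally, and the result is returned to the model as a `function_call_output` item before requesting a new response. `finish_tool_call` is an illustrative helper name; the calls inside it are the ones shown in the hunk:

```python
import json

async def finish_tool_call(handler, call_id: str, raw_args: str) -> None:
    args = json.loads(raw_args)                 # concatenated ...arguments.delta chunks
    query = args.get("query", "")
    results = await handler.search_web(query)   # run the tool locally
    await handler.connection.conversation.item.create(
        item={
            "type": "function_call_output",
            "call_id": call_id,
            "output": results,
        }
    )
    await handler.connection.response.create()  # ask the model to continue
```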
@@ -2093,6 +2154,7 @@ RULES:
         else:
             # Normal mode - use Realtime API
             if not self.connection:
+                print(f"[RECEIVE] No connection in normal mode, skipping")
                 return
             try:
                 _, array = frame
@@ -2101,16 +2163,16 @@ RULES:
                 await self.connection.input_audio_buffer.append(audio=audio_message)
             except Exception as e:
                 print(f"Error in receive: {e}")
-                # Connection might be closed, ignore the error
 
     async def emit(self) -> tuple[int, np.ndarray] | AdditionalOutputs | None:
-        # In interpretation mode, we need to keep checking for audio
+        # In interpretation mode, we need to keep checking for outputs
         if self.interpretation_mode:
-            # Use a timeout to prevent blocking forever
+            # Use a short timeout to prevent blocking
             try:
-                item = await asyncio.wait_for(wait_for_item(self.output_queue), timeout=0.1)
+                item = await asyncio.wait_for(wait_for_item(self.output_queue), timeout=0.05)
                 return item
             except asyncio.TimeoutError:
+                # Return None to keep the stream alive
                 return None
         else:
             # Normal mode
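The `emit()` change above tightens the interpretation-mode poll timeout from 0.1 s to 0.05 s, so the handler returns `None` (a keep-alive) more promptly when the queue is empty. The same pattern in isolation, with a plain `asyncio.Queue` standing in for `wait_for_item`:

```python
import asyncio

async def poll_queue(queue: asyncio.Queue, timeout: float = 0.05):
    """Return the next queued item, or None if nothing arrives in time."""
    try:
        return await asyncio.wait_for(queue.get(), timeout=timeout)
    except asyncio.TimeoutError:
        return None  # None keeps the stream loop ticking instead of blocking
```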
@@ -2124,6 +2186,8 @@ RULES:
             return item
 
     async def shutdown(self) -> None:
+        print(f"[SHUTDOWN] Called with interpretation_mode={self.interpretation_mode}")
+
         if self.interpretation_mode:
             # Clean up interpretation mode
             self.audio_buffer = []
@@ -2134,6 +2198,7 @@ RULES:
             if self.connection:
                 await self.connection.close()
                 self.connection = None
+                print("[NORMAL MODE] Connection closed")
 
 
 # Create initial handler instance
@@ -2173,9 +2238,11 @@ async def custom_offer(request: Request):
     interpretation_mode = body.get("interpretation_mode", False)
     interpretation_languages = body.get("interpretation_languages", [])
 
-    print(f"Custom offer - webrtc_id: {webrtc_id}, web_search_enabled: {web_search_enabled}, "
-          f"target_language: {target_language}, interpretation_mode: {interpretation_mode}, "
-          f"interpretation_languages: {interpretation_languages}")
+    print(f"[OFFER] Received offer with webrtc_id: {webrtc_id}")
+    print(f"[OFFER] interpretation_mode: {interpretation_mode}")
+    print(f"[OFFER] interpretation_languages: {interpretation_languages}")
+    print(f"[OFFER] web_search_enabled: {web_search_enabled}")
+    print(f"[OFFER] target_language: {target_language}")
 
     # Store settings with timestamp
     if webrtc_id:
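The offer route stores one settings dict per `webrtc_id`, stamped with the event-loop clock; that timestamp is what `copy()` sorts on when it has no id to go by. A sketch using the same key names; `prune_settings` is a hypothetical cleanup helper that is not part of the commit:

```python
import asyncio

connection_settings: dict[str, dict] = {}

def store_settings(webrtc_id: str, body: dict) -> None:
    # Mirrors the keys the route reads from the request body.
    connection_settings[webrtc_id] = {
        'web_search_enabled': body.get('web_search_enabled', False),
        'target_language': body.get('target_language', ''),
        'system_prompt': body.get('system_prompt', ''),
        'interpretation_mode': body.get('interpretation_mode', False),
        'interpretation_languages': body.get('interpretation_languages', []),
        'timestamp': asyncio.get_event_loop().time(),  # same clock as app.py
    }

def prune_settings(max_age: float = 300.0) -> None:
    """Hypothetical: drop entries older than max_age seconds."""
    now = asyncio.get_event_loop().time()
    stale = [k for k, v in connection_settings.items()
             if now - v.get('timestamp', 0) > max_age]
    for k in stale:
        del connection_settings[k]
```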
@@ -2187,6 +2254,9 @@ async def custom_offer(request: Request):
             'interpretation_languages': interpretation_languages,
             'timestamp': asyncio.get_event_loop().time()
         }
+
+        print(f"[OFFER] Stored settings for {webrtc_id}:")
+        print(f"[OFFER] {connection_settings[webrtc_id]}")
 
     # Remove our custom route temporarily
     custom_route = None
@@ -2196,12 +2266,15 @@ async def custom_offer(request: Request):
             break
 
     # Forward to stream's offer handler
+    print(f"[OFFER] Forwarding to stream.offer()")
     response = await stream.offer(body)
 
     # Re-add our custom route
     if custom_route:
         app.routes.insert(0, custom_route)
 
+    print(f"[OFFER] Response status: {response.get('status', 'unknown') if isinstance(response, dict) else 'OK'}")
+
     return response