seawolf2357 committed
Commit f6a65af · verified · 1 Parent(s): 24f7b71

Update app.py

Files changed (1)
  1. app.py +219 -41
app.py CHANGED
@@ -1211,6 +1211,17 @@ connection_settings = {}
 # Initialize OpenAI client for text chat
 client = openai.AsyncOpenAI()
 
+def get_translation_instructions(target_language: str) -> str:
+    """Get instructions for translation based on target language"""
+    if not target_language:
+        return ""
+
+    language_name = SUPPORTED_LANGUAGES.get(target_language, target_language)
+    return (
+        f"\n\nIMPORTANT: You must respond in {language_name} ({target_language}). "
+        f"Translate all your responses to {language_name}."
+    )
+
 def update_chatbot(chatbot: list[dict], response: ResponseAudioTranscriptDoneEvent):
     chatbot.append({"role": "assistant", "content": response.transcript})
     return chatbot
@@ -1234,10 +1245,38 @@ async def process_text_chat(message: str, web_search_enabled: bool, target_langu
     try:
         # Prepare system message
         base_instructions = system_prompt or "You are a helpful assistant."
-        translation_instructions = get_translation_instructions(target_language)
+
+        # Add strong language instructions if language is selected
+        if target_language:
+            language_name = SUPPORTED_LANGUAGES.get(target_language, target_language)
+
+            # Language-specific instructions
+            if target_language == "en":
+                lang_specific = "\nYou MUST respond in English ONLY. Never use Korean or any other language."
+            elif target_language == "ja":
+                lang_specific = "\n日本語でのみ応答してください。韓国語や他の言語は使用しないでください。"
+            elif target_language == "zh":
+                lang_specific = "\n只能用中文回答。不要使用韩语或其他任何语言。"
+            elif target_language == "es":
+                lang_specific = "\nDebe responder SOLO en español. Nunca use coreano u otros idiomas."
+            elif target_language == "fr":
+                lang_specific = "\nVous devez répondre UNIQUEMENT en français. N'utilisez jamais le coréen ou d'autres langues."
+            elif target_language == "de":
+                lang_specific = "\nSie müssen NUR auf Deutsch antworten. Verwenden Sie niemals Koreanisch oder andere Sprachen."
+            else:
+                lang_specific = f"\nYou MUST respond ONLY in {language_name}. Never use any other language."
+
+            translation_instructions = (
+                f"\n\nIMPORTANT: Your response language is set to {language_name} ({target_language})."
+                f"{lang_specific}"
+                f"\nEven if the user writes in Korean or another language, you must ALWAYS respond in {language_name}."
+                f"\nThis is a strict requirement. Output language: {language_name} ONLY."
+            )
+
+            base_instructions = base_instructions + translation_instructions
 
         messages = [
-            {"role": "system", "content": base_instructions + translation_instructions}
+            {"role": "system", "content": base_instructions}
         ]
 
         # Handle web search if enabled
@@ -1264,7 +1303,7 @@ async def process_text_chat(message: str, web_search_enabled: bool, target_langu
 
         messages.append({"role": "user", "content": message})
 
-        # Call GPT-4o-mini
+        # Call GPT-4o-mini with strong language enforcement
         response = await client.chat.completions.create(
             model="gpt-4o-mini",
             messages=messages,
@@ -1272,8 +1311,14 @@ async def process_text_chat(message: str, web_search_enabled: bool, target_langu
             max_tokens=2000
         )
 
+        response_text = response.choices[0].message.content
+
+        # Debug logging
+        print(f"[TEXT CHAT] Target language: {target_language}")
+        print(f"[TEXT CHAT] Response preview: {response_text[:100]}...")
+
         return {
-            "response": response.choices[0].message.content,
+            "response": response_text,
             "language": SUPPORTED_LANGUAGES.get(target_language, "") if target_language else ""
         }
 
@@ -1309,7 +1354,8 @@ class OpenAIHandler(AsyncStreamHandler):
         self.audio_buffer = []
         self.is_recording = False
         self.silence_frames = 0
-        self.silence_threshold = 30  # Number of silent frames before stopping
+        self.silence_threshold = 20  # Reduced for faster response (20 frames = ~0.4 seconds)
+        self.min_audio_length = 10  # Minimum frames to consider as speech
 
         print(f"Handler created with web_search_enabled={web_search_enabled}, "
               f"target_language={target_language}, webrtc_id={webrtc_id}, "
@@ -1409,21 +1455,32 @@ class OpenAIHandler(AsyncStreamHandler):
             # 2. Translate with GPT-4o-mini
             target_lang_name = SUPPORTED_LANGUAGES.get(self.interpretation_language, self.interpretation_language)
 
+            # Create very specific translation prompt
+            if self.interpretation_language == "en":
+                system_prompt = "You are a translator. Translate Korean to English. Output ONLY the English translation, nothing else."
+            elif self.interpretation_language == "ja":
+                system_prompt = "You are a translator. Translate Korean to Japanese. Output ONLY the Japanese translation (日本語のみ), nothing else."
+            elif self.interpretation_language == "zh":
+                system_prompt = "You are a translator. Translate Korean to Chinese. Output ONLY the Chinese translation (只输出中文), nothing else."
+            elif self.interpretation_language == "es":
+                system_prompt = "You are a translator. Translate Korean to Spanish. Output ONLY the Spanish translation (solo español), nothing else."
+            else:
+                system_prompt = f"You are a translator. Translate Korean to {target_lang_name}. Output ONLY the {target_lang_name} translation, nothing else."
+
             print(f"[INTERPRETATION] Translating to {target_lang_name}...")
             translation_response = await self.client.chat.completions.create(
                 model="gpt-4o-mini",
                 messages=[
                     {
                         "role": "system",
-                        "content": f"You are a translator. Translate the following text to {target_lang_name}. "
-                                   f"Provide only the translation, nothing else."
+                        "content": system_prompt
                     },
                     {
                         "role": "user",
                         "content": user_text
                     }
                 ],
-                temperature=0.3,
+                temperature=0.1,  # Lower temperature for more literal translation
                 max_tokens=200
             )
 
@@ -1432,9 +1489,25 @@ class OpenAIHandler(AsyncStreamHandler):
 
             # 3. Generate speech with TTS
             print("[INTERPRETATION] Generating speech...")
+
+            # Select appropriate voice for the language
+            # Using voices that work better for each language
+            voice_map = {
+                "en": "nova",     # Nova has clear English pronunciation
+                "es": "nova",     # Nova works well for Spanish
+                "fr": "shimmer",  # Shimmer works well for French
+                "de": "onyx",     # Onyx works well for German
+                "ja": "nova",     # Nova can handle Japanese
+                "zh": "nova",     # Nova can handle Chinese
+                "ko": "nova",     # Nova can handle Korean
+            }
+            selected_voice = voice_map.get(self.interpretation_language, "nova")
+
+            print(f"[INTERPRETATION] Using voice: {selected_voice} for language: {self.interpretation_language}")
+
             tts_response = await self.client.audio.speech.create(
                 model="tts-1",
-                voice="alloy",
+                voice=selected_voice,
                 input=translated_text,
                 response_format="pcm",  # PCM format for direct playback
                 speed=1.0
@@ -1445,10 +1518,10 @@ class OpenAIHandler(AsyncStreamHandler):
             async for chunk in tts_response.iter_bytes(1024):
                 audio_bytes += chunk
 
-            # Convert PCM to numpy array
+            # Convert PCM to numpy array (TTS outputs at 24kHz)
             audio_array = np.frombuffer(audio_bytes, dtype=np.int16)
 
-            # Resample from 24kHz (TTS output) to our sample rate if needed
+            # Send audio in chunks
             if len(audio_array) > 0:
                 # Split audio into chunks and send
                 chunk_size = 480  # Match our frame size
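For illustration, a self-contained sketch of the PCM handling this hunk describes: raw 16-bit PCM bytes reinterpreted as an int16 array and sliced into 480-sample frames. The synthetic payload and the padding of the final frame are assumptions for the sketch, not code from app.py.

    import numpy as np

    # Synthetic stand-in for the bytes accumulated from tts_response.iter_bytes(1024).
    audio_bytes = (np.sin(np.linspace(0, 100, 24_000)) * 32_000).astype(np.int16).tobytes()

    # Reinterpret the raw PCM as 16-bit samples, then slice into fixed-size frames.
    audio_array = np.frombuffer(audio_bytes, dtype=np.int16)
    chunk_size = 480  # match the handler's frame size
    for i in range(0, len(audio_array), chunk_size):
        chunk = audio_array[i:i + chunk_size]
        if len(chunk) < chunk_size:
            chunk = np.pad(chunk, (0, chunk_size - len(chunk)))  # pad the tail frame
        # a real handler would queue (sample_rate, chunk) for playback here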
@@ -1472,6 +1545,9 @@ class OpenAIHandler(AsyncStreamHandler):
 
         except Exception as e:
             print(f"[INTERPRETATION] Error: {e}")
+            import traceback
+            traceback.print_exc()
+
             # Send error message to client
             error_data = {
                 "event": type('Event', (), {
@@ -1531,6 +1607,15 @@ class OpenAIHandler(AsyncStreamHandler):
             print(f"[INTERPRETATION MODE] Active - using Whisper + GPT-4o-mini + TTS")
             print(f"[INTERPRETATION MODE] Target language: {self.interpretation_language}")
             # Just keep the handler ready to process audio
+            # Create an infinite loop to keep the handler alive
+            try:
+                while True:
+                    await asyncio.sleep(0.1)
+                    # Check if we need to process any audio
+                    if self.is_recording and self.silence_frames > self.silence_threshold:
+                        await self.process_interpretation()
+            except asyncio.CancelledError:
+                print("[INTERPRETATION MODE] Handler cancelled")
             return
 
         # Normal mode - connect to Realtime API
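The interpretation branch above keeps the handler alive with a sleep-poll loop that exits on task cancellation. A generic, self-contained sketch of that pattern follows; the names keep_alive, should_process, and process are hypothetical, not from app.py.

    import asyncio

    async def keep_alive(should_process, process, interval: float = 0.1):
        """Wake every `interval` seconds, run `process` when `should_process` is true,
        and exit quietly when the surrounding task is cancelled."""
        try:
            while True:
                await asyncio.sleep(interval)
                if should_process():
                    await process()
        except asyncio.CancelledError:
            print("keep-alive loop cancelled")

    async def _record(hits):
        hits.append(1)

    async def demo():
        hits = []
        task = asyncio.create_task(keep_alive(lambda: True, lambda: _record(hits)))
        await asyncio.sleep(0.35)      # let the loop run a few times
        task.cancel()                  # shutdown path -> CancelledError inside the loop
        await asyncio.gather(task, return_exceptions=True)
        print(len(hits))               # roughly 3 iterations

    asyncio.run(demo())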
@@ -1539,7 +1624,68 @@ class OpenAIHandler(AsyncStreamHandler):
         base_instructions = self.system_prompt or "You are a helpful assistant."
 
         # Add translation instructions if language is selected
-        translation_instructions = self.get_translation_instructions()
+        if self.target_language:
+            language_name = SUPPORTED_LANGUAGES.get(self.target_language, self.target_language)
+
+            # Create base translation instruction
+            base_translation = f"You MUST respond ONLY in {language_name}."
+
+            # Add language-specific instructions with examples
+            if self.target_language == "en":
+                translation_instructions = f"""
+                {base_translation}
+
+                CRITICAL RULES:
+                1. EVERY word you say must be in English.
+                2. Do NOT use Korean (한국어) at all.
+                3. If user speaks Korean, understand it but ALWAYS reply in English.
+                4. Example: User says "안녕하세요" → You say "Hello! How can I help you today?"
+                5. Your language mode is: ENGLISH ONLY.
+                """
+            elif self.target_language == "ja":
+                translation_instructions = f"""
+                {base_translation}
+
+                重要なルール:
+                1. 必ず日本語のみで応答してください。
+                2. 韓国語は一切使用しないでください。
+                3. ユーザーが韓国語で話しても、必ず日本語で返答してください。
+                4. 例:ユーザーが「안녕하세요」と言ったら → あなたは「こんにちは!今日はどのようにお手伝いできますか?」と言う
+                5. 言語モード:日本語のみ
+                """
+            elif self.target_language == "zh":
+                translation_instructions = f"""
+                {base_translation}
+
+                重要规则:
+                1. 必须只用中文回答。
+                2. 绝对不要使用韩语。
+                3. 即使用户说韩语,也必须用中文回复。
+                4. 例如:用户说"안녕하세요" → 你说"你好!我能为您做什么?"
+                5. 语言模式:仅中文
+                """
+            elif self.target_language == "es":
+                translation_instructions = f"""
+                {base_translation}
+
+                REGLAS CRÍTICAS:
+                1. TODAS tus palabras deben estar en español.
+                2. NO uses coreano en absoluto.
+                3. Si el usuario habla coreano, entiéndelo pero SIEMPRE responde en español.
+                4. Ejemplo: Usuario dice "안녕하세요" → Tú dices "¡Hola! ¿Cómo puedo ayudarte hoy?"
+                5. Modo de idioma: SOLO ESPAÑOL
+                """
+            else:
+                translation_instructions = f"""
+                {base_translation}
+
+                RULES:
+                1. You must ONLY speak in {language_name}.
+                2. NEVER use Korean or any other language.
+                3. Always respond in {language_name} regardless of what language the user speaks.
+                """
+        else:
+            translation_instructions = ""
 
         if self.web_search_enabled and self.search_client:
             tools = [{
@@ -1574,10 +1720,12 @@ class OpenAIHandler(AsyncStreamHandler):
                 "When in doubt, USE web_search. It's better to search and provide accurate information "
                 "than to guess or use outdated information."
             )
-            instructions = base_instructions + search_instructions + translation_instructions
+            instructions = base_instructions + translation_instructions + search_instructions
         else:
             instructions = base_instructions + translation_instructions
 
+        print(f"[NORMAL MODE] Instructions: {instructions[:200]}...")
+
         async with self.client.beta.realtime.connect(
             model="gpt-4o-mini-realtime-preview-2024-12-17"
         ) as conn:
@@ -1589,19 +1737,39 @@ class OpenAIHandler(AsyncStreamHandler):
                 "tool_choice": "auto" if tools else "none"
             }
 
-            # Add voice setting if target language is selected
+            # Use appropriate voice for the language
             if self.target_language:
-                # Map languages to appropriate voices
+                # Use voice that works better for each language
                 voice_map = {
-                    "en": "alloy",
-                    "es": "nova",
-                    "fr": "nova",
-                    "de": "nova",
-                    "ja": "nova",
-                    "zh": "nova",
-                    # Default to alloy for other languages
+                    "en": "nova",     # Nova has clear pronunciation
+                    "es": "nova",     # Nova works well for Spanish
+                    "fr": "shimmer",  # Shimmer for French
+                    "de": "onyx",     # Onyx for German
+                    "ja": "nova",     # Nova can handle Japanese
+                    "zh": "nova",     # Nova can handle Chinese
+                    "ko": "nova",     # Nova can handle Korean
                 }
-                session_update["voice"] = voice_map.get(self.target_language, "alloy")
+                session_update["voice"] = voice_map.get(self.target_language, "nova")
+
+                # Force output language settings
+                session_update["modalities"] = ["text", "audio"]
+                session_update["output_audio_format"] = "pcm16"
+
+                # Add extra language enforcement in system message
+                if self.target_language == "en":
+                    extra_instruction = "\n\nREMINDER: Speak in English only. 英語のみで話してください。"
+                elif self.target_language == "ja":
+                    extra_instruction = "\n\nREMINDER: 日本語のみで話してください。Speak in Japanese only."
+                elif self.target_language == "zh":
+                    extra_instruction = "\n\nREMINDER: 只说中文。Speak in Chinese only."
+                else:
+                    extra_instruction = ""
+
+                session_update["instructions"] = instructions + extra_instruction
+
+                print(f"[TRANSLATION MODE] Target language: {self.target_language}")
+                print(f"[TRANSLATION MODE] Voice: {session_update['voice']}")
+                print(f"[TRANSLATION MODE] Instructions preview: {session_update['instructions'][:200]}...")
 
             await conn.session.update(session=session_update)
             self.connection = conn
@@ -1613,6 +1781,9 @@ class OpenAIHandler(AsyncStreamHandler):
                     print(f"Function event: {event.type}")
 
                 if event.type == "response.audio_transcript.done":
+                    print(f"[RESPONSE] Transcript: {event.transcript[:100]}...")
+                    print(f"[RESPONSE] Expected language: {self.target_language}")
+
                     output_data = {
                         "event": event,
                         "language": SUPPORTED_LANGUAGES.get(self.target_language, "") if self.target_language else ""
@@ -1684,20 +1855,21 @@ class OpenAIHandler(AsyncStreamHandler):
             # Simple voice activity detection
             audio_level = np.abs(array).mean()
 
-            if audio_level > 300:  # Lowered threshold for better detection
+            if audio_level > 200:  # Lower threshold for better detection
+                if not self.is_recording:
+                    print(f"[INTERPRETATION] Started recording, level: {audio_level:.1f}")
                 self.is_recording = True
                 self.silence_frames = 0
                 self.audio_buffer.append(array)
-                if len(self.audio_buffer) % 10 == 0:  # Log every 10 frames
-                    print(f"[INTERPRETATION] Recording... buffer size: {len(self.audio_buffer)}, level: {audio_level:.1f}")
             elif self.is_recording:
                 self.silence_frames += 1
                 self.audio_buffer.append(array)
 
                 # If we've had enough silence, process the audio
-                if self.silence_frames > self.silence_threshold:
-                    print(f"[INTERPRETATION] Silence detected, processing {len(self.audio_buffer)} frames")
-                    await self.process_interpretation()
+                if self.silence_frames > self.silence_threshold and len(self.audio_buffer) > self.min_audio_length:
+                    print(f"[INTERPRETATION] Silence detected after {len(self.audio_buffer)} frames")
+                    # Process in the background to avoid blocking
+                    asyncio.create_task(self.process_interpretation())
         else:
             # Normal mode - use Realtime API
             if not self.connection:
@@ -1712,18 +1884,24 @@ class OpenAIHandler(AsyncStreamHandler):
                 # Connection might be closed, ignore the error
 
     async def emit(self) -> tuple[int, np.ndarray] | AdditionalOutputs | None:
-        # In interpretation mode, check if we need to process buffered audio
-        if self.interpretation_mode and self.is_recording and self.silence_frames > self.silence_threshold:
-            await self.process_interpretation()
-
-        item = await wait_for_item(self.output_queue)
-
-        # Check if it's a dict with text message
-        if isinstance(item, dict) and item.get('type') == 'text_message':
-            await self.process_text_message(item['content'])
-            return None
-
-        return item
+        # In interpretation mode, we need to keep checking for audio
+        if self.interpretation_mode:
+            # Use a timeout to prevent blocking forever
+            try:
+                item = await asyncio.wait_for(wait_for_item(self.output_queue), timeout=0.1)
+                return item
+            except asyncio.TimeoutError:
+                return None
+        else:
+            # Normal mode
+            item = await wait_for_item(self.output_queue)
+
+            # Check if it's a dict with text message
+            if isinstance(item, dict) and item.get('type') == 'text_message':
+                await self.process_text_message(item['content'])
+                return None
+
+            return item
 
     async def shutdown(self) -> None:
         if self.interpretation_mode:
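The reworked emit() in the last hunk polls the output queue with a timeout so interpretation mode never blocks indefinitely. A self-contained sketch of that asyncio.wait_for pattern follows; poll_once and the plain asyncio.Queue are stand-ins for the app's wait_for_item helper and its queue, not code from app.py.

    import asyncio

    async def poll_once(queue: asyncio.Queue, timeout: float = 0.1):
        """Return the next queued item, or None if nothing arrives within `timeout` seconds."""
        try:
            return await asyncio.wait_for(queue.get(), timeout=timeout)
        except asyncio.TimeoutError:
            return None

    async def demo():
        queue: asyncio.Queue = asyncio.Queue()
        print(await poll_once(queue))    # None: empty queue times out after 0.1 s
        await queue.put("audio-frame")
        print(await poll_once(queue))    # "audio-frame"

    asyncio.run(demo())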