Update app.py

app.py CHANGED
@@ -1211,6 +1211,17 @@ connection_settings = {}
 # Initialize OpenAI client for text chat
 client = openai.AsyncOpenAI()
 
+def get_translation_instructions(target_language: str) -> str:
+    """Get instructions for translation based on target language"""
+    if not target_language:
+        return ""
+
+    language_name = SUPPORTED_LANGUAGES.get(target_language, target_language)
+    return (
+        f"\n\nIMPORTANT: You must respond in {language_name} ({target_language}). "
+        f"Translate all your responses to {language_name}."
+    )
+
 def update_chatbot(chatbot: list[dict], response: ResponseAudioTranscriptDoneEvent):
     chatbot.append({"role": "assistant", "content": response.transcript})
     return chatbot
@@ -1234,10 +1245,38 @@ async def process_text_chat(message: str, web_search_enabled: bool, target_langu
     try:
         # Prepare system message
         base_instructions = system_prompt or "You are a helpful assistant."
+
+        # Add strong language instructions if language is selected
+        if target_language:
+            language_name = SUPPORTED_LANGUAGES.get(target_language, target_language)
+
+            # Language-specific instructions
+            if target_language == "en":
+                lang_specific = "\nYou MUST respond in English ONLY. Never use Korean or any other language."
+            elif target_language == "ja":
+                lang_specific = "\n日本語でのみ応答してください。韓国語や他の言語は使用しないでください。"
+            elif target_language == "zh":
+                lang_specific = "\n只能用中文回答。不要使用韩语或其他任何语言。"
+            elif target_language == "es":
+                lang_specific = "\nDebe responder SOLO en español. Nunca use coreano u otros idiomas."
+            elif target_language == "fr":
+                lang_specific = "\nVous devez répondre UNIQUEMENT en français. N'utilisez jamais le coréen ou d'autres langues."
+            elif target_language == "de":
+                lang_specific = "\nSie müssen NUR auf Deutsch antworten. Verwenden Sie niemals Koreanisch oder andere Sprachen."
+            else:
+                lang_specific = f"\nYou MUST respond ONLY in {language_name}. Never use any other language."
+
+            translation_instructions = (
+                f"\n\nIMPORTANT: Your response language is set to {language_name} ({target_language})."
+                f"{lang_specific}"
+                f"\nEven if the user writes in Korean or another language, you must ALWAYS respond in {language_name}."
+                f"\nThis is a strict requirement. Output language: {language_name} ONLY."
+            )
+
+            base_instructions = base_instructions + translation_instructions
 
         messages = [
+            {"role": "system", "content": base_instructions}
         ]
 
         # Handle web search if enabled
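
The per-language if/elif chain above could equally be a dict lookup; a sketch only, with an illustrative LANG_SPECIFIC table that is not part of the commit:

    LANG_SPECIFIC = {  # hypothetical module-level table
        "en": "\nYou MUST respond in English ONLY. Never use Korean or any other language.",
        "ja": "\n日本語でのみ応答してください。韓国語や他の言語は使用しないでください。",
        # ...remaining languages as in the diff
    }
    lang_specific = LANG_SPECIFIC.get(
        target_language,
        f"\nYou MUST respond ONLY in {language_name}. Never use any other language.",
    )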
@@ -1264,7 +1303,7 @@ async def process_text_chat(message: str, web_search_enabled: bool, target_langu
 
         messages.append({"role": "user", "content": message})
 
-        # Call GPT-4o-mini
+        # Call GPT-4o-mini with strong language enforcement
         response = await client.chat.completions.create(
             model="gpt-4o-mini",
             messages=messages,
@@ -1272,8 +1311,14 @@ async def process_text_chat(message: str, web_search_enabled: bool, target_langu
             max_tokens=2000
         )
 
+        response_text = response.choices[0].message.content
+
+        # Debug logging
+        print(f"[TEXT CHAT] Target language: {target_language}")
+        print(f"[TEXT CHAT] Response preview: {response_text[:100]}...")
+
         return {
+            "response": response_text,
             "language": SUPPORTED_LANGUAGES.get(target_language, "") if target_language else ""
         }
 
@@ -1309,7 +1354,8 @@ class OpenAIHandler(AsyncStreamHandler):
         self.audio_buffer = []
         self.is_recording = False
         self.silence_frames = 0
+        self.silence_threshold = 20  # Reduced for faster response (20 frames = ~0.4 seconds)
+        self.min_audio_length = 10  # Minimum frames to consider as speech
 
         print(f"Handler created with web_search_enabled={web_search_enabled}, "
               f"target_language={target_language}, webrtc_id={webrtc_id}, "
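
The "~0.4 seconds" in the comment follows from the frame length; a quick check, assuming ~20 ms frames (480 samples at 24 kHz, the chunk size used later in this file):

    frame_seconds = 480 / 24_000              # 0.02 s per frame (assumed frame size)
    silence_threshold = 20
    print(silence_threshold * frame_seconds)  # 0.4 s of trailing silence ends an utterance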
@@ -1409,21 +1455,32 @@ class OpenAIHandler(AsyncStreamHandler):
             # 2. Translate with GPT-4o-mini
             target_lang_name = SUPPORTED_LANGUAGES.get(self.interpretation_language, self.interpretation_language)
 
+            # Create very specific translation prompt
+            if self.interpretation_language == "en":
+                system_prompt = "You are a translator. Translate Korean to English. Output ONLY the English translation, nothing else."
+            elif self.interpretation_language == "ja":
+                system_prompt = "You are a translator. Translate Korean to Japanese. Output ONLY the Japanese translation (日本語のみ), nothing else."
+            elif self.interpretation_language == "zh":
+                system_prompt = "You are a translator. Translate Korean to Chinese. Output ONLY the Chinese translation (只输出中文), nothing else."
+            elif self.interpretation_language == "es":
+                system_prompt = "You are a translator. Translate Korean to Spanish. Output ONLY the Spanish translation (solo español), nothing else."
+            else:
+                system_prompt = f"You are a translator. Translate Korean to {target_lang_name}. Output ONLY the {target_lang_name} translation, nothing else."
+
             print(f"[INTERPRETATION] Translating to {target_lang_name}...")
             translation_response = await self.client.chat.completions.create(
                 model="gpt-4o-mini",
                 messages=[
                     {
                         "role": "system",
+                        "content": system_prompt
                     },
                     {
                         "role": "user",
                         "content": user_text
                     }
                 ],
+                temperature=0.1, # Lower temperature for more literal translation
                 max_tokens=200
             )
 
@@ -1432,9 +1489,25 @@ class OpenAIHandler(AsyncStreamHandler):
 
             # 3. Generate speech with TTS
             print("[INTERPRETATION] Generating speech...")
+
+            # Select appropriate voice for the language
+            # Using voices that work better for each language
+            voice_map = {
+                "en": "nova", # Nova has clear English pronunciation
+                "es": "nova", # Nova works well for Spanish
+                "fr": "shimmer", # Shimmer works well for French
+                "de": "onyx", # Onyx works well for German
+                "ja": "nova", # Nova can handle Japanese
+                "zh": "nova", # Nova can handle Chinese
+                "ko": "nova", # Nova can handle Korean
+            }
+            selected_voice = voice_map.get(self.interpretation_language, "nova")
+
+            print(f"[INTERPRETATION] Using voice: {selected_voice} for language: {self.interpretation_language}")
+
             tts_response = await self.client.audio.speech.create(
                 model="tts-1",
+                voice=selected_voice,
                 input=translated_text,
                 response_format="pcm", # PCM format for direct playback
                 speed=1.0
@@ -1445,10 +1518,10 @@ class OpenAIHandler(AsyncStreamHandler):
             async for chunk in tts_response.iter_bytes(1024):
                 audio_bytes += chunk
 
-            # Convert PCM to numpy array
+            # Convert PCM to numpy array (TTS outputs at 24kHz)
             audio_array = np.frombuffer(audio_bytes, dtype=np.int16)
 
+            # Send audio in chunks
            if len(audio_array) > 0:
                 # Split audio into chunks and send
                 chunk_size = 480 # Match our frame size
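
The loop that actually splits and sends these frames falls outside this hunk; a sketch of one plausible shape for it (the padding behaviour and the helper name are assumptions, not shown in the diff):

    import numpy as np

    def split_pcm_into_frames(audio_array: np.ndarray, chunk_size: int = 480) -> list[np.ndarray]:
        # Pad the tail so every frame carries exactly chunk_size samples.
        frames = []
        for start in range(0, len(audio_array), chunk_size):
            chunk = audio_array[start:start + chunk_size]
            if len(chunk) < chunk_size:
                chunk = np.pad(chunk, (0, chunk_size - len(chunk)))
            frames.append(chunk)
        return frames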
@@ -1472,6 +1545,9 @@ class OpenAIHandler(AsyncStreamHandler):
 
         except Exception as e:
             print(f"[INTERPRETATION] Error: {e}")
+            import traceback
+            traceback.print_exc()
+
             # Send error message to client
             error_data = {
                 "event": type('Event', (), {
@@ -1531,6 +1607,15 @@ class OpenAIHandler(AsyncStreamHandler):
             print(f"[INTERPRETATION MODE] Active - using Whisper + GPT-4o-mini + TTS")
             print(f"[INTERPRETATION MODE] Target language: {self.interpretation_language}")
             # Just keep the handler ready to process audio
+            # Create an infinite loop to keep the handler alive
+            try:
+                while True:
+                    await asyncio.sleep(0.1)
+                    # Check if we need to process any audio
+                    if self.is_recording and self.silence_frames > self.silence_threshold:
+                        await self.process_interpretation()
+            except asyncio.CancelledError:
+                print("[INTERPRETATION MODE] Handler cancelled")
             return
 
         # Normal mode - connect to Realtime API
@@ -1539,7 +1624,68 @@ class OpenAIHandler(AsyncStreamHandler):
         base_instructions = self.system_prompt or "You are a helpful assistant."
 
         # Add translation instructions if language is selected
+        if self.target_language:
+            language_name = SUPPORTED_LANGUAGES.get(self.target_language, self.target_language)
+
+            # Create base translation instruction
+            base_translation = f"You MUST respond ONLY in {language_name}."
+
+            # Add language-specific instructions with examples
+            if self.target_language == "en":
+                translation_instructions = f"""
+{base_translation}
+
+CRITICAL RULES:
+1. EVERY word you say must be in English.
+2. Do NOT use Korean (한국어) at all.
+3. If user speaks Korean, understand it but ALWAYS reply in English.
+4. Example: User says "안녕하세요" → You say "Hello! How can I help you today?"
+5. Your language mode is: ENGLISH ONLY.
+"""
+            elif self.target_language == "ja":
+                translation_instructions = f"""
+{base_translation}
+
+重要なルール:
+1. 必ず日本語のみで応答してください。
+2. 韓国語は一切使用しないでください。
+3. ユーザーが韓国語で話しても、必ず日本語で返答してください。
+4. 例:ユーザーが「안녕하세요」と言ったら → あなたは「こんにちは!今日はどのようにお手伝いできますか?」と言う
+5. 言語モード:日本語のみ
+"""
+            elif self.target_language == "zh":
+                translation_instructions = f"""
+{base_translation}
+
+重要规则:
+1. 必须只用中文回答。
+2. 绝对不要使用韩语。
+3. 即使用户说韩语,也必须用中文回复。
+4. 例如:用户说"안녕하세요" → 你说"你好!我能为您做什么?"
+5. 语言模式:仅中文
+"""
+            elif self.target_language == "es":
+                translation_instructions = f"""
+{base_translation}
+
+REGLAS CRÍTICAS:
+1. TODAS tus palabras deben estar en español.
+2. NO uses coreano en absoluto.
+3. Si el usuario habla coreano, entiéndelo pero SIEMPRE responde en español.
+4. Ejemplo: Usuario dice "안녕하세요" → Tú dices "¡Hola! ¿Cómo puedo ayudarte hoy?"
+5. Modo de idioma: SOLO ESPAÑOL
+"""
+            else:
+                translation_instructions = f"""
+{base_translation}
+
+RULES:
+1. You must ONLY speak in {language_name}.
+2. NEVER use Korean or any other language.
+3. Always respond in {language_name} regardless of what language the user speaks.
+"""
+        else:
+            translation_instructions = ""
 
         if self.web_search_enabled and self.search_client:
             tools = [{
@@ -1574,10 +1720,12 @@ class OpenAIHandler(AsyncStreamHandler):
                 "When in doubt, USE web_search. It's better to search and provide accurate information "
                 "than to guess or use outdated information."
             )
+            instructions = base_instructions + translation_instructions + search_instructions
         else:
            instructions = base_instructions + translation_instructions
 
+        print(f"[NORMAL MODE] Instructions: {instructions[:200]}...")
+
         async with self.client.beta.realtime.connect(
             model="gpt-4o-mini-realtime-preview-2024-12-17"
         ) as conn:
@@ -1589,19 +1737,39 @@ class OpenAIHandler(AsyncStreamHandler):
                 "tool_choice": "auto" if tools else "none"
             }
 
+            # Use appropriate voice for the language
             if self.target_language:
+                # Use voice that works better for each language
                 voice_map = {
+                    "en": "nova", # Nova has clear pronunciation
+                    "es": "nova", # Nova works well for Spanish
+                    "fr": "shimmer", # Shimmer for French
+                    "de": "onyx", # Onyx for German
+                    "ja": "nova", # Nova can handle Japanese
+                    "zh": "nova", # Nova can handle Chinese
+                    "ko": "nova", # Nova can handle Korean
                 }
+                session_update["voice"] = voice_map.get(self.target_language, "nova")
+
+                # Force output language settings
+                session_update["modalities"] = ["text", "audio"]
+                session_update["output_audio_format"] = "pcm16"
+
+                # Add extra language enforcement in system message
+                if self.target_language == "en":
+                    extra_instruction = "\n\nREMINDER: Speak in English only. 英語のみで話してください。"
+                elif self.target_language == "ja":
+                    extra_instruction = "\n\nREMINDER: 日本語のみで話してください。Speak in Japanese only."
+                elif self.target_language == "zh":
+                    extra_instruction = "\n\nREMINDER: 只说中文。Speak in Chinese only."
+                else:
+                    extra_instruction = ""
+
+                session_update["instructions"] = instructions + extra_instruction
+
+                print(f"[TRANSLATION MODE] Target language: {self.target_language}")
+                print(f"[TRANSLATION MODE] Voice: {session_update['voice']}")
+                print(f"[TRANSLATION MODE] Instructions preview: {session_update['instructions'][:200]}...")
 
             await conn.session.update(session=session_update)
             self.connection = conn
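
The same voice_map literal now appears both in the interpretation TTS path and in this Realtime session setup; a small helper (sketch only, the name is hypothetical) would keep the two copies in sync:

    def pick_voice(language_code: str) -> str:
        # One shared table instead of two literals; values copied from the diff.
        voice_map = {
            "en": "nova", "es": "nova", "ja": "nova", "zh": "nova", "ko": "nova",
            "fr": "shimmer",
            "de": "onyx",
        }
        return voice_map.get(language_code, "nova")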
@@ -1613,6 +1781,9 @@ class OpenAIHandler(AsyncStreamHandler):
                 print(f"Function event: {event.type}")
 
                 if event.type == "response.audio_transcript.done":
+                    print(f"[RESPONSE] Transcript: {event.transcript[:100]}...")
+                    print(f"[RESPONSE] Expected language: {self.target_language}")
+
                     output_data = {
                         "event": event,
                         "language": SUPPORTED_LANGUAGES.get(self.target_language, "") if self.target_language else ""
@@ -1684,20 +1855,21 @@ class OpenAIHandler(AsyncStreamHandler):
             # Simple voice activity detection
             audio_level = np.abs(array).mean()
 
+            if audio_level > 200:  # Lower threshold for better detection
+                if not self.is_recording:
+                    print(f"[INTERPRETATION] Started recording, level: {audio_level:.1f}")
                 self.is_recording = True
                 self.silence_frames = 0
                 self.audio_buffer.append(array)
-                if len(self.audio_buffer) % 10 == 0: # Log every 10 frames
-                    print(f"[INTERPRETATION] Recording... buffer size: {len(self.audio_buffer)}, level: {audio_level:.1f}")
             elif self.is_recording:
                 self.silence_frames += 1
                 self.audio_buffer.append(array)
 
                 # If we've had enough silence, process the audio
-                if self.silence_frames > self.silence_threshold:
+                if self.silence_frames > self.silence_threshold and len(self.audio_buffer) > self.min_audio_length:
+                    print(f"[INTERPRETATION] Silence detected after {len(self.audio_buffer)} frames")
+                    # Process in the background to avoid blocking
+                    asyncio.create_task(self.process_interpretation())
         else:
             # Normal mode - use Realtime API
             if not self.connection:
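
The detection added here is a plain energy threshold with a silence countdown; a condensed, framework-free restatement (sketch only, thresholds copied from the diff):

    import numpy as np

    AUDIO_LEVEL_THRESHOLD = 200   # mean |amplitude| above this counts as speech
    SILENCE_THRESHOLD = 20        # ~0.4 s of trailing silence ends an utterance
    MIN_AUDIO_LENGTH = 10         # ignore bursts shorter than 10 frames

    def feed_frame(state: dict, frame: np.ndarray) -> bool:
        """Feed one frame; return True once a complete utterance is buffered."""
        level = np.abs(frame).mean()
        if level > AUDIO_LEVEL_THRESHOLD:
            state["recording"] = True
            state["silence"] = 0
            state["buffer"].append(frame)
        elif state.get("recording"):
            state["silence"] += 1
            state["buffer"].append(frame)
            return state["silence"] > SILENCE_THRESHOLD and len(state["buffer"]) > MIN_AUDIO_LENGTH
        return False

    # Usage: state = {"buffer": [], "silence": 0, "recording": False}; call feed_frame(state, frame) per frame.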
@@ -1712,18 +1884,24 @@ class OpenAIHandler(AsyncStreamHandler):
             # Connection might be closed, ignore the error
 
     async def emit(self) -> tuple[int, np.ndarray] | AdditionalOutputs | None:
+        # In interpretation mode, we need to keep checking for audio
+        if self.interpretation_mode:
+            # Use a timeout to prevent blocking forever
+            try:
+                item = await asyncio.wait_for(wait_for_item(self.output_queue), timeout=0.1)
+                return item
+            except asyncio.TimeoutError:
+                return None
+        else:
+            # Normal mode
+            item = await wait_for_item(self.output_queue)
+
+            # Check if it's a dict with text message
+            if isinstance(item, dict) and item.get('type') == 'text_message':
+                await self.process_text_message(item['content'])
+                return None
+
+            return item
 
     async def shutdown(self) -> None:
         if self.interpretation_mode: