Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1243,37 +1243,29 @@ async def process_text_chat(message: str, web_search_enabled: bool, target_langu
|
|
1243 |
system_prompt: str) -> Dict[str, str]:
|
1244 |
"""Process text chat using GPT-4o-mini model"""
|
1245 |
try:
|
1246 |
-
#
|
1247 |
-
base_instructions = system_prompt or "You are a helpful assistant."
|
1248 |
-
|
1249 |
-
# Add strong language instructions if language is selected
|
1250 |
if target_language:
|
1251 |
language_name = SUPPORTED_LANGUAGES.get(target_language, target_language)
|
1252 |
|
1253 |
-
#
|
1254 |
if target_language == "en":
|
1255 |
-
|
|
|
1256 |
elif target_language == "ja":
|
1257 |
-
|
|
|
1258 |
elif target_language == "zh":
|
1259 |
-
|
|
|
1260 |
elif target_language == "es":
|
1261 |
-
|
1262 |
-
|
1263 |
-
lang_specific = "\nVous devez répondre UNIQUEMENT en français. N'utilisez jamais le coréen ou d'autres langues."
|
1264 |
-
elif target_language == "de":
|
1265 |
-
lang_specific = "\nSie müssen NUR auf Deutsch antworten. Verwenden Sie niemals Koreanisch oder andere Sprachen."
|
1266 |
else:
|
1267 |
-
|
1268 |
-
|
1269 |
-
|
1270 |
-
|
1271 |
-
|
1272 |
-
f"\nEven if the user writes in Korean or another language, you must ALWAYS respond in {language_name}."
|
1273 |
-
f"\nThis is a strict requirement. Output language: {language_name} ONLY."
|
1274 |
-
)
|
1275 |
-
|
1276 |
-
base_instructions = base_instructions + translation_instructions
|
1277 |
|
1278 |
messages = [
|
1279 |
{"role": "system", "content": base_instructions}
|
@@ -1296,14 +1288,21 @@ async def process_text_chat(message: str, web_search_enabled: bool, target_langu
|
|
1296 |
for i, result in enumerate(search_results[:5], 1):
|
1297 |
search_context += f"{i}. {result['title']}\n{result['description']}\n\n"
|
1298 |
|
|
|
|
|
|
|
|
|
|
|
|
|
1299 |
messages.append({
|
1300 |
"role": "system",
|
1301 |
-
"content":
|
1302 |
})
|
1303 |
|
1304 |
-
|
|
|
1305 |
|
1306 |
-
# Call GPT-4o-mini
|
1307 |
response = await client.chat.completions.create(
|
1308 |
model="gpt-4o-mini",
|
1309 |
messages=messages,
|
@@ -1313,7 +1312,21 @@ async def process_text_chat(message: str, web_search_enabled: bool, target_langu
|
|
1313 |
|
1314 |
response_text = response.choices[0].message.content
|
1315 |
|
1316 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1317 |
print(f"[TEXT CHAT] Target language: {target_language}")
|
1318 |
print(f"[TEXT CHAT] Response preview: {response_text[:100]}...")
|
1319 |
|
@@ -1455,19 +1468,51 @@ class OpenAIHandler(AsyncStreamHandler):
|
|
1455 |
# 2. Translate with GPT-4o-mini
|
1456 |
target_lang_name = SUPPORTED_LANGUAGES.get(self.interpretation_language, self.interpretation_language)
|
1457 |
|
1458 |
-
# Create very
|
1459 |
-
|
1460 |
-
|
1461 |
-
|
1462 |
-
|
1463 |
-
|
1464 |
-
|
1465 |
-
|
1466 |
-
|
1467 |
-
|
1468 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1469 |
|
1470 |
print(f"[INTERPRETATION] Translating to {target_lang_name}...")
|
|
|
|
|
1471 |
translation_response = await self.client.chat.completions.create(
|
1472 |
model="gpt-4o-mini",
|
1473 |
messages=[
|
@@ -1477,26 +1522,33 @@ class OpenAIHandler(AsyncStreamHandler):
|
|
1477 |
},
|
1478 |
{
|
1479 |
"role": "user",
|
1480 |
-
"content": user_text
|
1481 |
}
|
1482 |
],
|
1483 |
-
temperature=0.1, #
|
1484 |
max_tokens=200
|
1485 |
)
|
1486 |
|
1487 |
translated_text = translation_response.choices[0].message.content.strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1488 |
print(f"[INTERPRETATION] Translated: {translated_text}")
|
1489 |
|
1490 |
# 3. Generate speech with TTS
|
1491 |
-
print("[INTERPRETATION] Generating speech
|
1492 |
|
1493 |
-
# Select appropriate voice
|
1494 |
-
# Using voices that work better for each language
|
1495 |
voice_map = {
|
1496 |
-
"en": "
|
1497 |
-
"es": "nova", # Nova
|
1498 |
-
"fr": "shimmer", # Shimmer
|
1499 |
-
"de": "
|
1500 |
"ja": "nova", # Nova can handle Japanese
|
1501 |
"zh": "nova", # Nova can handle Chinese
|
1502 |
"ko": "nova", # Nova can handle Korean
|
@@ -1505,13 +1557,29 @@ class OpenAIHandler(AsyncStreamHandler):
|
|
1505 |
|
1506 |
print(f"[INTERPRETATION] Using voice: {selected_voice} for language: {self.interpretation_language}")
|
1507 |
|
1508 |
-
|
1509 |
-
|
1510 |
-
|
1511 |
-
|
1512 |
-
|
1513 |
-
|
1514 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1515 |
|
1516 |
# Convert response to bytes
|
1517 |
audio_bytes = b""
|
@@ -1607,15 +1675,8 @@ class OpenAIHandler(AsyncStreamHandler):
|
|
1607 |
print(f"[INTERPRETATION MODE] Active - using Whisper + GPT-4o-mini + TTS")
|
1608 |
print(f"[INTERPRETATION MODE] Target language: {self.interpretation_language}")
|
1609 |
# Just keep the handler ready to process audio
|
1610 |
-
#
|
1611 |
-
|
1612 |
-
while True:
|
1613 |
-
await asyncio.sleep(0.1)
|
1614 |
-
# Check if we need to process any audio
|
1615 |
-
if self.is_recording and self.silence_frames > self.silence_threshold:
|
1616 |
-
await self.process_interpretation()
|
1617 |
-
except asyncio.CancelledError:
|
1618 |
-
print("[INTERPRETATION MODE] Handler cancelled")
|
1619 |
return
|
1620 |
|
1621 |
# Normal mode - connect to Realtime API
|
@@ -1627,63 +1688,85 @@ class OpenAIHandler(AsyncStreamHandler):
|
|
1627 |
if self.target_language:
|
1628 |
language_name = SUPPORTED_LANGUAGES.get(self.target_language, self.target_language)
|
1629 |
|
1630 |
-
#
|
1631 |
-
base_translation = f"You MUST respond ONLY in {language_name}."
|
1632 |
-
|
1633 |
-
# Add language-specific instructions with examples
|
1634 |
if self.target_language == "en":
|
1635 |
-
translation_instructions =
|
1636 |
-
|
1637 |
|
1638 |
-
|
1639 |
-
1.
|
1640 |
-
2.
|
1641 |
-
3.
|
1642 |
-
4.
|
1643 |
-
5.
|
|
|
|
|
|
|
|
|
1644 |
"""
|
|
|
|
|
|
|
1645 |
elif self.target_language == "ja":
|
1646 |
-
translation_instructions =
|
1647 |
-
|
1648 |
|
1649 |
-
|
1650 |
-
1.
|
1651 |
-
2.
|
1652 |
-
3.
|
1653 |
-
4.
|
1654 |
-
5.
|
|
|
|
|
|
|
|
|
1655 |
"""
|
|
|
|
|
1656 |
elif self.target_language == "zh":
|
1657 |
-
translation_instructions =
|
1658 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1659 |
|
1660 |
-
|
1661 |
-
|
1662 |
-
|
1663 |
-
3. 即使用户说韩语,也必须用中文回复。
|
1664 |
-
4. 例如:用户说"안녕하세요" → 你说"你好!我能为您做什么?"
|
1665 |
-
5. 语言模式:仅中文
|
1666 |
"""
|
|
|
|
|
1667 |
elif self.target_language == "es":
|
1668 |
-
translation_instructions =
|
1669 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1670 |
|
1671 |
-
|
1672 |
-
|
1673 |
-
|
1674 |
-
3. Si el usuario habla coreano, entiéndelo pero SIEMPRE responde en español.
|
1675 |
-
4. Ejemplo: Usuario dice "안녕하세요" → Tú dices "¡Hola! ¿Cómo puedo ayudarte hoy?"
|
1676 |
-
5. Modo de idioma: SOLO ESPAÑOL
|
1677 |
"""
|
|
|
1678 |
else:
|
1679 |
translation_instructions = f"""
|
1680 |
-
{
|
1681 |
|
1682 |
RULES:
|
1683 |
-
1.
|
1684 |
-
2.
|
1685 |
-
3. Always respond in {language_name}
|
1686 |
"""
|
|
|
1687 |
else:
|
1688 |
translation_instructions = ""
|
1689 |
|
@@ -1720,11 +1803,24 @@ RULES:
|
|
1720 |
"When in doubt, USE web_search. It's better to search and provide accurate information "
|
1721 |
"than to guess or use outdated information."
|
1722 |
)
|
1723 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
1724 |
else:
|
1725 |
-
|
|
|
|
|
|
|
|
|
1726 |
|
1727 |
-
print(f"[NORMAL MODE]
|
|
|
|
|
|
|
1728 |
|
1729 |
async with self.client.beta.realtime.connect(
|
1730 |
model="gpt-4o-mini-realtime-preview-2024-12-17"
|
@@ -1734,42 +1830,39 @@ RULES:
|
|
1734 |
"turn_detection": {"type": "server_vad"},
|
1735 |
"instructions": instructions,
|
1736 |
"tools": tools,
|
1737 |
-
"tool_choice": "auto" if tools else "none"
|
|
|
|
|
|
|
|
|
1738 |
}
|
1739 |
|
1740 |
# Use appropriate voice for the language
|
1741 |
if self.target_language:
|
1742 |
-
#
|
|
|
1743 |
voice_map = {
|
1744 |
-
"en": "nova", # Nova has
|
1745 |
-
"es": "nova", # Nova works
|
1746 |
"fr": "shimmer", # Shimmer for French
|
1747 |
-
"de": "
|
1748 |
-
"ja": "
|
1749 |
-
"zh": "
|
1750 |
-
"ko": "nova", # Nova
|
1751 |
}
|
1752 |
session_update["voice"] = voice_map.get(self.target_language, "nova")
|
1753 |
|
1754 |
-
#
|
1755 |
session_update["modalities"] = ["text", "audio"]
|
1756 |
-
session_update["output_audio_format"] = "pcm16"
|
1757 |
|
1758 |
-
#
|
1759 |
-
|
1760 |
-
extra_instruction = "\n\nREMINDER: Speak in English only. 英語のみで話してください。"
|
1761 |
-
elif self.target_language == "ja":
|
1762 |
-
extra_instruction = "\n\nREMINDER: 日本語のみで話してください。Speak in Japanese only."
|
1763 |
-
elif self.target_language == "zh":
|
1764 |
-
extra_instruction = "\n\nREMINDER: 只说中文。Speak in Chinese only."
|
1765 |
-
else:
|
1766 |
-
extra_instruction = ""
|
1767 |
|
1768 |
-
|
|
|
|
|
1769 |
|
1770 |
-
print(f"[TRANSLATION MODE]
|
1771 |
-
print(f"[TRANSLATION MODE] Voice: {session_update['voice']}")
|
1772 |
-
print(f"[TRANSLATION MODE] Instructions preview: {session_update['instructions'][:200]}...")
|
1773 |
|
1774 |
await conn.session.update(session=session_update)
|
1775 |
self.connection = conn
|
|
|
1243 |
system_prompt: str) -> Dict[str, str]:
|
1244 |
"""Process text chat using GPT-4o-mini model"""
|
1245 |
try:
|
1246 |
+
# If target language is set, put a language-enforcing instruction in front of the caller's system prompt
|
|
|
|
|
|
|
1247 |
if target_language:
|
1248 |
language_name = SUPPORTED_LANGUAGES.get(target_language, target_language)
|
1249 |
|
1250 |
+
# Create system prompt in target language
|
1251 |
if target_language == "en":
|
1252 |
+
base_instructions = f"You are a helpful assistant. You speak ONLY English. Never use Korean or any other language. {system_prompt}"
|
1253 |
+
user_prefix = "Please respond in English: "
|
1254 |
elif target_language == "ja":
|
1255 |
+
base_instructions = f"あなたは親切なアシスタントです。日本語のみを話します。韓国語や他の言語は絶対に使用しません。{system_prompt}"
|
1256 |
+
user_prefix = "日本語で答えてください: "
|
1257 |
elif target_language == "zh":
|
1258 |
+
base_instructions = f"你是一个乐于助人的助手。你只说中文。绝不使用韩语或其他语言。{system_prompt}"
|
1259 |
+
user_prefix = "请用中文回答: "
|
1260 |
elif target_language == "es":
|
1261 |
+
base_instructions = f"Eres un asistente útil. Solo hablas español. Nunca uses coreano u otros idiomas. {system_prompt}"
|
1262 |
+
user_prefix = "Por favor responde en español: "
|
|
|
|
|
|
|
1263 |
else:
|
1264 |
+
base_instructions = f"You are a helpful assistant that speaks ONLY {language_name}. {system_prompt}"
|
1265 |
+
user_prefix = f"Please respond in {language_name}: "
|
1266 |
+
else:
|
1267 |
+
base_instructions = system_prompt or "You are a helpful assistant."
|
1268 |
+
user_prefix = ""
|
|
|
|
|
|
|
|
|
|
|
1269 |
|
1270 |
messages = [
|
1271 |
{"role": "system", "content": base_instructions}
|
|
|
1288 |
for i, result in enumerate(search_results[:5], 1):
|
1289 |
search_context += f"{i}. {result['title']}\n{result['description']}\n\n"
|
1290 |
|
1291 |
+
# Add search context in target language if set
|
1292 |
+
if target_language:
|
1293 |
+
search_instruction = f"Use this search information but respond in {SUPPORTED_LANGUAGES.get(target_language, target_language)} only: "
|
1294 |
+
else:
|
1295 |
+
search_instruction = "다음 웹 검색 결과를 참고하여 답변하세요: "
|
1296 |
+
|
1297 |
messages.append({
|
1298 |
"role": "system",
|
1299 |
+
"content": search_instruction + "\n\n" + search_context
|
1300 |
})
|
1301 |
|
1302 |
+
# Add user message with language prefix
|
1303 |
+
messages.append({"role": "user", "content": user_prefix + message})
|
1304 |
|
1305 |
+
# Call GPT-4o-mini
|
1306 |
response = await client.chat.completions.create(
|
1307 |
model="gpt-4o-mini",
|
1308 |
messages=messages,
|
|
|
1312 |
|
1313 |
response_text = response.choices[0].message.content
|
1314 |
|
1315 |
+
# Final check - if Korean text shows up and the target language is not Korean, retry once with a stronger prompt
|
1316 |
+
if target_language and target_language != "ko":
|
1317 |
+
import re
|
1318 |
+
if re.search(r'[가-힣]', response_text):
|
1319 |
+
print(f"[TEXT CHAT] WARNING: Korean detected in response for {target_language}")
|
1320 |
+
# Try again with stronger prompt
|
1321 |
+
messages[-1] = {"role": "user", "content": f"ONLY {SUPPORTED_LANGUAGES.get(target_language, target_language)}, NO KOREAN: {message}"}
|
1322 |
+
retry_response = await client.chat.completions.create(
|
1323 |
+
model="gpt-4o-mini",
|
1324 |
+
messages=messages,
|
1325 |
+
temperature=0.3,
|
1326 |
+
max_tokens=2000
|
1327 |
+
)
|
1328 |
+
response_text = retry_response.choices[0].message.content
|
1329 |
+
|
1330 |
print(f"[TEXT CHAT] Target language: {target_language}")
|
1331 |
print(f"[TEXT CHAT] Response preview: {response_text[:100]}...")
|
1332 |
|
|
|
1468 |
# 2. Translate with GPT-4o-mini
|
1469 |
target_lang_name = SUPPORTED_LANGUAGES.get(self.interpretation_language, self.interpretation_language)
|
1470 |
|
1471 |
+
# Create very explicit translation examples
|
1472 |
+
translation_examples = {
|
1473 |
+
"en": {
|
1474 |
+
"안녕하세요": "Hello",
|
1475 |
+
"감사합니다": "Thank you",
|
1476 |
+
"오늘 날씨가 좋네요": "The weather is nice today"
|
1477 |
+
},
|
1478 |
+
"ja": {
|
1479 |
+
"안녕하세요": "こんにちは",
|
1480 |
+
"감사합니다": "ありがとうございます",
|
1481 |
+
"오늘 날씨가 좋네요": "今日はいい天気ですね"
|
1482 |
+
},
|
1483 |
+
"zh": {
|
1484 |
+
"안녕하세요": "你好",
|
1485 |
+
"감사합니다": "谢谢",
|
1486 |
+
"오늘 날씨가 좋네요": "今天天气很好"
|
1487 |
+
},
|
1488 |
+
"es": {
|
1489 |
+
"안녕하세요": "Hola",
|
1490 |
+
"감사합니다": "Gracias",
|
1491 |
+
"오늘 날씨가 좋네요": "El clima está agradable hoy"
|
1492 |
+
}
|
1493 |
+
}
|
1494 |
+
|
1495 |
+
examples = translation_examples.get(self.interpretation_language, translation_examples["en"])
|
1496 |
+
examples_text = "\n".join([f'"{k}" → "{v}"' for k, v in examples.items()])
|
1497 |
+
|
1498 |
+
# Ultra-specific prompt
|
1499 |
+
system_prompt = f"""You are a Korean to {target_lang_name} translator.
|
1500 |
+
|
1501 |
+
STRICT RULES:
|
1502 |
+
1. Output ONLY the {target_lang_name} translation
|
1503 |
+
2. Do NOT output Korean
|
1504 |
+
3. Do NOT add explanations
|
1505 |
+
4. Do NOT answer questions
|
1506 |
+
5. Just translate
|
1507 |
+
|
1508 |
+
Examples:
|
1509 |
+
{examples_text}
|
1510 |
+
|
1511 |
+
Now translate the Korean text to {target_lang_name}. Output ONLY the translation in {target_lang_name}:"""
|
1512 |
|
1513 |
print(f"[INTERPRETATION] Translating to {target_lang_name}...")
|
1514 |
+
print(f"[INTERPRETATION] System prompt: {system_prompt}")
|
1515 |
+
|
1516 |
translation_response = await self.client.chat.completions.create(
|
1517 |
model="gpt-4o-mini",
|
1518 |
messages=[
|
|
|
1522 |
},
|
1523 |
{
|
1524 |
"role": "user",
|
1525 |
+
"content": f"Translate this Korean to {target_lang_name}: {user_text}"
|
1526 |
}
|
1527 |
],
|
1528 |
+
temperature=0.1, # Very low temperature
|
1529 |
max_tokens=200
|
1530 |
)
|
1531 |
|
1532 |
translated_text = translation_response.choices[0].message.content.strip()
|
1533 |
+
|
1534 |
+
# Remove any Korean characters if they accidentally appear
|
1535 |
+
import re
|
1536 |
+
if re.search(r'[가-힣]', translated_text):
|
1537 |
+
print(f"[INTERPRETATION] WARNING: Korean characters detected in translation: {translated_text}")
|
1538 |
+
# Try to extract only non-Korean parts
|
1539 |
+
translated_text = re.sub(r'[가-힣\s]+', ' ', translated_text).strip()
|
1540 |
+
|
1541 |
print(f"[INTERPRETATION] Translated: {translated_text}")
|
1542 |
|
1543 |
# 3. Generate speech with TTS
|
1544 |
+
print(f"[INTERPRETATION] Generating speech for text: {translated_text}")
|
1545 |
|
1546 |
+
# Select appropriate voice and ensure it speaks the target language
|
|
|
1547 |
voice_map = {
|
1548 |
+
"en": "alloy", # Alloy is native English speaker
|
1549 |
+
"es": "nova", # Nova handles Spanish well
|
1550 |
+
"fr": "shimmer", # Shimmer handles French well
|
1551 |
+
"de": "echo", # Echo handles German well
|
1552 |
"ja": "nova", # Nova can handle Japanese
|
1553 |
"zh": "nova", # Nova can handle Chinese
|
1554 |
"ko": "nova", # Nova can handle Korean
|
|
|
1557 |
|
1558 |
print(f"[INTERPRETATION] Using voice: {selected_voice} for language: {self.interpretation_language}")
|
1559 |
|
1560 |
+
# Safety net: if Korean characters remain in an English translation, speak a fixed error message instead
|
1561 |
+
if self.interpretation_language == "en" and re.search(r'[가-힣]', translated_text):
|
1562 |
+
print("[INTERPRETATION] ERROR: Korean characters in English translation!")
|
1563 |
+
translated_text = "Translation error occurred"
|
1564 |
+
|
1565 |
+
try:
|
1566 |
+
tts_response = await self.client.audio.speech.create(
|
1567 |
+
model="tts-1",
|
1568 |
+
voice=selected_voice,
|
1569 |
+
input=translated_text,
|
1570 |
+
response_format="pcm", # PCM format for direct playback
|
1571 |
+
speed=1.0
|
1572 |
+
)
|
1573 |
+
except Exception as tts_error:
|
1574 |
+
print(f"[INTERPRETATION] TTS Error: {tts_error}")
|
1575 |
+
# If TTS fails, try with a different voice
|
1576 |
+
tts_response = await self.client.audio.speech.create(
|
1577 |
+
model="tts-1",
|
1578 |
+
voice="alloy", # Fallback to alloy
|
1579 |
+
input=translated_text,
|
1580 |
+
response_format="pcm",
|
1581 |
+
speed=1.0
|
1582 |
+
)
|
1583 |
|
1584 |
# Convert response to bytes
|
1585 |
audio_bytes = b""
|
|
|
1675 |
print(f"[INTERPRETATION MODE] Active - using Whisper + GPT-4o-mini + TTS")
|
1676 |
print(f"[INTERPRETATION MODE] Target language: {self.interpretation_language}")
|
1677 |
# Just keep the handler ready to process audio
|
1678 |
+
# Don't use infinite loop here - the handler will be called by the framework
|
1679 |
+
self.client = openai.AsyncOpenAI()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1680 |
return
|
1681 |
|
1682 |
# Normal mode - connect to Realtime API
|
|
|
1688 |
if self.target_language:
|
1689 |
language_name = SUPPORTED_LANGUAGES.get(self.target_language, self.target_language)
|
1690 |
|
1691 |
+
# Use the target language for the system prompt itself
|
|
|
|
|
|
|
1692 |
if self.target_language == "en":
|
1693 |
+
translation_instructions = """
|
1694 |
+
YOU ARE AN ENGLISH-ONLY ASSISTANT.
|
1695 |
|
1696 |
+
ABSOLUTE RULES:
|
1697 |
+
1. You can ONLY speak English. No Korean (한국어) allowed.
|
1698 |
+
2. Even if the user speaks Korean, you MUST respond in English.
|
1699 |
+
3. Every single word must be in English.
|
1700 |
+
4. If you output even one Korean character, you have failed.
|
1701 |
+
5. Example response: "Hello! How can I help you today?"
|
1702 |
+
|
1703 |
+
YOUR LANGUAGE MODE: ENGLISH ONLY
|
1704 |
+
DO NOT USE: 안녕하세요, 감사합니다, or any Korean
|
1705 |
+
ALWAYS USE: Hello, Thank you, and English words only
|
1706 |
"""
|
1707 |
+
# Override base instructions to be in English
|
1708 |
+
base_instructions = "You are a helpful assistant that speaks ONLY English."
|
1709 |
+
|
1710 |
elif self.target_language == "ja":
|
1711 |
+
translation_instructions = """
|
1712 |
+
あなたは日本語のみを話すアシスタントです。
|
1713 |
|
1714 |
+
絶対的なルール:
|
1715 |
+
1. 日本語のみを使用してください。韓国語(한국어)は禁止です。
|
1716 |
+
2. ユーザーが韓国語で話しても、必ず日本語で返答してください。
|
1717 |
+
3. すべての単語は日本語でなければなりません。
|
1718 |
+
4. 韓国語を一文字でも出力したら失敗です。
|
1719 |
+
5. 応答例:「こんにちは!今日はどのようにお手伝いできますか?」
|
1720 |
+
|
1721 |
+
言語モード:日本語のみ
|
1722 |
+
使用禁止:안녕하세요、감사합니다、韓国語全般
|
1723 |
+
必ず使用:こんにちは、ありがとうございます、日本語のみ
|
1724 |
"""
|
1725 |
+
base_instructions = "あなたは日本語のみを話す親切なアシスタントです。"
|
1726 |
+
|
1727 |
elif self.target_language == "zh":
|
1728 |
+
translation_instructions = """
|
1729 |
+
你是一个只说中文的助手。
|
1730 |
+
|
1731 |
+
绝对规则:
|
1732 |
+
1. 只能使用中文。禁止使用韩语(한국어)。
|
1733 |
+
2. 即使用户说韩语,也必须用中文回复。
|
1734 |
+
3. 每个字都必须是中文。
|
1735 |
+
4. 如果输出任何韩语字符,就是失败。
|
1736 |
+
5. 回复示例:"你好!我今天能为您做什么?"
|
1737 |
|
1738 |
+
语言模式:仅中文
|
1739 |
+
禁止使用:안녕하세요、감사합니다、任何韩语
|
1740 |
+
必须使用:你好、谢谢、只用中文
|
|
|
|
|
|
|
1741 |
"""
|
1742 |
+
base_instructions = "你是一个只说中文的友好助手。"
|
1743 |
+
|
1744 |
elif self.target_language == "es":
|
1745 |
+
translation_instructions = """
|
1746 |
+
ERES UN ASISTENTE QUE SOLO HABLA ESPAÑOL.
|
1747 |
+
|
1748 |
+
REGLAS ABSOLUTAS:
|
1749 |
+
1. Solo puedes hablar español. No se permite coreano (한국어).
|
1750 |
+
2. Incluso si el usuario habla coreano, DEBES responder en español.
|
1751 |
+
3. Cada palabra debe estar en español.
|
1752 |
+
4. Si produces aunque sea un carácter coreano, has fallado.
|
1753 |
+
5. Respuesta ejemplo: "¡Hola! ¿Cómo puedo ayudarte hoy?"
|
1754 |
|
1755 |
+
MODO DE IDIOMA: SOLO ESPAÑOL
|
1756 |
+
NO USAR: 안녕하세요, 감사합니다, o cualquier coreano
|
1757 |
+
SIEMPRE USAR: Hola, Gracias, y solo palabras en español
|
|
|
|
|
|
|
1758 |
"""
|
1759 |
+
base_instructions = "Eres un asistente útil que habla SOLO español."
|
1760 |
else:
|
1761 |
translation_instructions = f"""
|
1762 |
+
YOU MUST ONLY SPEAK {language_name.upper()}.
|
1763 |
|
1764 |
RULES:
|
1765 |
+
1. Output only in {language_name}
|
1766 |
+
2. Never use Korean
|
1767 |
+
3. Always respond in {language_name}
|
1768 |
"""
|
1769 |
+
base_instructions = f"You are a helpful assistant that speaks ONLY {language_name}."
|
1770 |
else:
|
1771 |
translation_instructions = ""
|
1772 |
|
|
|
1803 |
"When in doubt, USE web_search. It's better to search and provide accurate information "
|
1804 |
"than to guess or use outdated information."
|
1805 |
)
|
1806 |
+
|
1807 |
+
# Combine all instructions
|
1808 |
+
if translation_instructions:
|
1809 |
+
# Translation instructions already include base_instructions
|
1810 |
+
instructions = translation_instructions + search_instructions
|
1811 |
+
else:
|
1812 |
+
instructions = base_instructions + search_instructions
|
1813 |
else:
|
1814 |
+
# No web search
|
1815 |
+
if translation_instructions:
|
1816 |
+
instructions = translation_instructions
|
1817 |
+
else:
|
1818 |
+
instructions = base_instructions
|
1819 |
|
1820 |
+
print(f"[NORMAL MODE] Base instructions: {base_instructions[:100]}...")
|
1821 |
+
print(f"[NORMAL MODE] Translation instructions: {translation_instructions[:200] if translation_instructions else 'None'}...")
|
1822 |
+
print(f"[NORMAL MODE] Combined instructions length: {len(instructions)}")
|
1823 |
+
print(f"[NORMAL MODE] Target language: {self.target_language}")
|
1824 |
|
1825 |
async with self.client.beta.realtime.connect(
|
1826 |
model="gpt-4o-mini-realtime-preview-2024-12-17"
|
|
|
1830 |
"turn_detection": {"type": "server_vad"},
|
1831 |
"instructions": instructions,
|
1832 |
"tools": tools,
|
1833 |
+
"tool_choice": "auto" if tools else "none",
|
1834 |
+
"temperature": 0.7,
|
1835 |
+
"max_response_output_tokens": 4096,
|
1836 |
+
"modalities": ["text", "audio"],
|
1837 |
+
"voice": "alloy" # Default voice
|
1838 |
}
|
1839 |
|
1840 |
# Use appropriate voice for the language
|
1841 |
if self.target_language:
|
1842 |
+
# Force language through multiple mechanisms
|
1843 |
+
# 1. Use voice that's known to work well with the language
|
1844 |
voice_map = {
|
1845 |
+
"en": "nova", # Nova has clearer English
|
1846 |
+
"es": "nova", # Nova works for Spanish
|
1847 |
"fr": "shimmer", # Shimmer for French
|
1848 |
+
"de": "echo", # Echo for German
|
1849 |
+
"ja": "alloy", # Alloy can do Japanese
|
1850 |
+
"zh": "alloy", # Alloy can do Chinese
|
1851 |
+
"ko": "nova", # Nova for Korean
|
1852 |
}
|
1853 |
session_update["voice"] = voice_map.get(self.target_language, "nova")
|
1854 |
|
1855 |
+
# 2. Request both text and audio output modalities (no language is encoded here)
|
1856 |
session_update["modalities"] = ["text", "audio"]
|
|
|
1857 |
|
1858 |
+
# 3. Set output format
|
1859 |
+
session_update["output_audio_format"] = "pcm16"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1860 |
|
1861 |
+
# 4. Add language hint to the system (if supported by API)
|
1862 |
+
if self.target_language in ["en", "es", "fr", "de", "ja", "zh"]:
|
1863 |
+
session_update["language"] = self.target_language # Try setting language directly
|
1864 |
|
1865 |
+
print(f"[TRANSLATION MODE] Session update: {json.dumps(session_update, indent=2)}")
|
|
|
|
|
1866 |
|
1867 |
await conn.session.update(session=session_update)
|
1868 |
self.connection = conn
|