Update app.py
app.py
CHANGED
@@ -1465,152 +1465,155 @@ class OpenAIHandler(AsyncStreamHandler):
             if not user_text:
                 return
 
-            # 2. Translate with GPT-4o-mini
+            # 2. Translate with GPT-4o-mini - FIXED VERSION
             target_lang_name = SUPPORTED_LANGUAGES.get(self.interpretation_language, self.interpretation_language)
 
-            # …
-            translation_examples = {
-                …
-                    "오늘 날씨가 좋네요": "今天天气很好"
-                },
-                "es": {
-                    "안녕하세요": "Hola",
-                    "감사합니다": "Gracias",
-                    "오늘 날씨가 좋네요": "El clima está agradable hoy"
-                }
-            }
-
-            examples = translation_examples.get(self.interpretation_language, translation_examples["en"])
-            examples_text = "\n".join([f'"{k}" → "{v}"' for k, v in examples.items()])
-
-            # Ultra-specific prompt
-            system_prompt = f"""You are a Korean to {target_lang_name} translator.
-
-STRICT RULES:
-1. Output ONLY the {target_lang_name} translation
-2. Do NOT output Korean
-3. Do NOT add explanations
-4. Do NOT answer questions
-5. Just translate
-
-Examples:
-{examples_text}
-
-Now translate the Korean text to {target_lang_name}. Output ONLY the translation in {target_lang_name}:"""
-
-            print(f"[INTERPRETATION] …")
-            print(f"[INTERPRETATION] System prompt: {system_prompt}")
+            # More direct translation approach
+            if self.interpretation_language == "en":
+                translation_prompt = f"Translate this Korean text to English. Output ONLY the English translation, nothing else: {user_text}"
+            elif self.interpretation_language == "ja":
+                translation_prompt = f"韓国語を日本語に翻訳してください。日本語の翻訳のみを出力してください: {user_text}"
+            elif self.interpretation_language == "zh":
+                translation_prompt = f"将韩语翻译成中文。只输出中文翻译: {user_text}"
+            elif self.interpretation_language == "es":
+                translation_prompt = f"Traduce este texto coreano al español. Solo muestra la traducción en español: {user_text}"
+            elif self.interpretation_language == "fr":
+                translation_prompt = f"Traduisez ce texte coréen en français. Affichez uniquement la traduction française: {user_text}"
+            elif self.interpretation_language == "de":
+                translation_prompt = f"Übersetzen Sie diesen koreanischen Text ins Deutsche. Geben Sie nur die deutsche Übersetzung aus: {user_text}"
+            else:
+                translation_prompt = f"Translate Korean to {target_lang_name}. Output only {target_lang_name}: {user_text}"
+
+            print(f"[INTERPRETATION] Translation prompt: {translation_prompt}")
 
+            # Use a single user message approach for better results
             translation_response = await self.client.chat.completions.create(
                 model="gpt-4o-mini",
                 messages=[
-                    {
-                        "role": "system",
-                        "content": system_prompt
-                    },
                     {
                         "role": "user",
-                        "content": …
+                        "content": translation_prompt
                     }
                 ],
-                temperature=0.…,
+                temperature=0.0,  # Set to 0 for most deterministic output
                 max_tokens=200
             )
 
             translated_text = translation_response.choices[0].message.content.strip()
 
-            # …
+            # Validation: Check if Korean characters are present in non-Korean translations
             import re
-            if re.search(r'[가-힣]', translated_text):
-                print(f"[INTERPRETATION] WARNING: Korean …")
-                …
+            if self.interpretation_language != "ko" and re.search(r'[가-힣]', translated_text):
+                print(f"[INTERPRETATION] WARNING: Korean detected in {self.interpretation_language} translation")
+
+                # Try again with a more forceful prompt
+                force_prompt = {
+                    "en": f"English only: {user_text}",
+                    "ja": f"日本語のみ: {user_text}",
+                    "zh": f"仅中文: {user_text}",
+                    "es": f"Solo español: {user_text}",
+                    "fr": f"Français seulement: {user_text}",
+                    "de": f"Nur Deutsch: {user_text}"
+                }.get(self.interpretation_language, f"{target_lang_name} only: {user_text}")
+
+                retry_response = await self.client.chat.completions.create(
+                    model="gpt-4o-mini",
+                    messages=[{"role": "user", "content": force_prompt}],
+                    temperature=0.0,
+                    max_tokens=200
+                )
+
+                new_translation = retry_response.choices[0].message.content.strip()
+
+                # If still has Korean, extract non-Korean parts
+                if re.search(r'[가-힣]', new_translation):
+                    # Remove all Korean characters and clean up
+                    cleaned = re.sub(r'[가-힣]+', ' ', new_translation).strip()
+                    cleaned = re.sub(r'\s+', ' ', cleaned)  # Remove multiple spaces
+                    if cleaned and len(cleaned) > 3:  # If we have meaningful content left
+                        translated_text = cleaned
+                    else:
+                        # Fallback to a simple translation
+                        translated_text = {
+                            "en": "Translation completed",
+                            "ja": "翻訳完了",
+                            "zh": "翻译完成",
+                            "es": "Traducción completada",
+                            "fr": "Traduction terminée",
+                            "de": "Übersetzung abgeschlossen"
+                        }.get(self.interpretation_language, "Translation completed")
+                else:
+                    translated_text = new_translation
 
-            print(f"[INTERPRETATION] …")
+            print(f"[INTERPRETATION] Final translated text: {translated_text}")
 
             # 3. Generate speech with TTS
-
-            # Select appropriate voice and ensure it speaks the target language
+            # Select voice optimized for the target language
             voice_map = {
-                "en": "…",
+                "en": "nova",     # Nova has clear English pronunciation
                 "es": "nova",     # Nova handles Spanish well
-                "fr": "shimmer",  # Shimmer …
-                "de": "echo",     # Echo …
-                "ja": "…",
-                "zh": "…",
-                "ko": "nova",     # Nova …
+                "fr": "shimmer",  # Shimmer for French
+                "de": "echo",     # Echo for German
+                "ja": "alloy",    # Alloy can handle Japanese
+                "zh": "alloy",    # Alloy can handle Chinese
+                "ko": "nova",     # Nova for Korean
+                "it": "nova",     # Nova for Italian
+                "pt": "shimmer",  # Shimmer for Portuguese
+                "ru": "onyx",     # Onyx for Russian
             }
             selected_voice = voice_map.get(self.interpretation_language, "nova")
 
-            print(f"[INTERPRETATION] …")
-
-            # For some languages, we might need to add pronunciation hints
-            if self.interpretation_language == "en" and re.search(r'[가-힣]', translated_text):
-                print("[INTERPRETATION] ERROR: Korean characters in English translation!")
-                translated_text = "Translation error occurred"
+            print(f"[INTERPRETATION] Generating TTS with voice: {selected_voice}")
 
             try:
                 tts_response = await self.client.audio.speech.create(
                     model="tts-1",
                     voice=selected_voice,
                     input=translated_text,
-                    response_format="pcm",
+                    response_format="pcm",
                     speed=1.0
                 )
+
+                # Convert response to bytes
+                audio_bytes = b""
+                async for chunk in tts_response.iter_bytes(1024):
+                    audio_bytes += chunk
+
+                # Convert PCM to numpy array
+                audio_array = np.frombuffer(audio_bytes, dtype=np.int16)
+
+                # Send audio in chunks
+                if len(audio_array) > 0:
+                    chunk_size = 480
+                    for i in range(0, len(audio_array), chunk_size):
+                        chunk = audio_array[i:i + chunk_size]
+                        if len(chunk) < chunk_size:
+                            chunk = np.pad(chunk, (0, chunk_size - len(chunk)), 'constant')
+
+                        await self.output_queue.put((SAMPLE_RATE, chunk.reshape(1, -1)))
+
+                # Send transcript event - show both original and translation
+                output_data = {
+                    "event": type('Event', (), {
+                        'transcript': f"{user_text} → {translated_text}"
+                    })(),
+                    "language": target_lang_name,
+                    "mode": "interpretation"
+                }
+                await self.output_queue.put(AdditionalOutputs(output_data))
+
             except Exception as tts_error:
                 print(f"[INTERPRETATION] TTS Error: {tts_error}")
-                # …
-                …
-
-            audio_bytes = b""
-            async for chunk in tts_response.iter_bytes(1024):
-                audio_bytes += chunk
-
-            # Convert PCM to numpy array (TTS outputs at 24kHz)
-            audio_array = np.frombuffer(audio_bytes, dtype=np.int16)
-
-            # Send audio in chunks
-            if len(audio_array) > 0:
-                # Split audio into chunks and send
-                chunk_size = 480  # Match our frame size
-                for i in range(0, len(audio_array), chunk_size):
-                    chunk = audio_array[i:i + chunk_size]
-                    if len(chunk) < chunk_size:
-                        # Pad the last chunk if necessary
-                        chunk = np.pad(chunk, (0, chunk_size - len(chunk)), 'constant')
-
-                    await self.output_queue.put((SAMPLE_RATE, chunk.reshape(1, -1)))
-
-            # Send transcript event
-            output_data = {
-                "event": type('Event', (), {
-                    'transcript': f"{user_text} → {translated_text}"
-                })(),
-                "language": target_lang_name,
-                "mode": "interpretation"
-            }
-            await self.output_queue.put(AdditionalOutputs(output_data))
-
+                # Send error message
+                error_data = {
+                    "event": type('Event', (), {
+                        'transcript': f"TTS 오류: {str(tts_error)}"
+                    })(),
+                    "language": "",
+                    "mode": "error"
+                }
+                await self.output_queue.put(AdditionalOutputs(error_data))
+
         except Exception as e:
             print(f"[INTERPRETATION] Error: {e}")
             import traceback
@@ -1630,6 +1633,8 @@ Now translate the Korean text to {target_lang_name}. Output ONLY the translation
         self.audio_buffer = []
         self.is_recording = False
         self.silence_frames = 0
+
+
 
     def get_translation_instructions(self):
         """Get instructions for translation based on target language"""
@@ -1672,7 +1677,7 @@ Now translate the Korean text to {target_lang_name}. Output ONLY the translation
 
         # If in interpretation mode, don't connect to Realtime API
         if self.interpretation_mode:
-            print(f"[INTERPRETATION MODE] Active - using Whisper + GPT-…")
+            print(f"[INTERPRETATION MODE] Active - using Whisper + GPT-4o-mini + TTS")
             print(f"[INTERPRETATION MODE] Target language: {self.interpretation_language}")
             # Just keep the handler ready to process audio
             # Don't use infinite loop here - the handler will be called by the framework
@@ -1823,7 +1828,7 @@ RULES:
         print(f"[NORMAL MODE] Target language: {self.target_language}")
 
         async with self.client.beta.realtime.connect(
-            model="gpt-…"
+            model="gpt-4o-mini-realtime-preview-2024-12-17"
         ) as conn:
             # Update session with tools
             session_update = {
@@ -2080,7 +2085,7 @@ async def custom_offer(request: Request):
 
 @app.post("/chat/text")
 async def chat_text(request: Request):
-    """Handle text chat messages using GPT-…"""
+    """Handle text chat messages using GPT-4.1-mini"""
     try:
         body = await request.json()
         message = body.get("message", "")