Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -719,6 +719,15 @@ HTML_CONTENT = """<!DOCTYPE html>
|
|
719 |
sendButton.style.display = 'block';
|
720 |
|
721 |
console.log('Interpretation mode disabled');
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
722 |
}
|
723 |
|
724 |
console.log('Interpretation mode:', interpretationMode);
|
@@ -747,6 +756,15 @@ HTML_CONTENT = """<!DOCTYPE html>
|
|
747 |
sendButton.style.display = 'none';
|
748 |
|
749 |
console.log('Interpretation mode enabled with language:', interpretationLanguage);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
750 |
}
|
751 |
});
|
752 |
|
@@ -979,6 +997,17 @@ HTML_CONTENT = """<!DOCTYPE html>
|
|
979 |
updateButtonState();
|
980 |
});
|
981 |
webrtc_id = Math.random().toString(36).substring(7);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
982 |
const response = await fetch('/webrtc/offer', {
|
983 |
method: 'POST',
|
984 |
headers: { 'Content-Type': 'application/json' },
|
@@ -1006,8 +1035,21 @@ HTML_CONTENT = """<!DOCTYPE html>
|
|
1006 |
eventSource.addEventListener("output", (event) => {
|
1007 |
const eventJson = JSON.parse(event.data);
|
1008 |
let content = eventJson.content;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1009 |
if (selectedLanguage && eventJson.language) {
|
1010 |
content += ` <span class="language-info">[${eventJson.language}]</span>`;
|
|
|
|
|
1011 |
}
|
1012 |
addMessage("assistant", content);
|
1013 |
});
|
@@ -1298,30 +1340,22 @@ class OpenAIHandler(AsyncStreamHandler):
|
|
1298 |
target_language_name = SUPPORTED_LANGUAGES.get(self.interpretation_language, self.interpretation_language)
|
1299 |
target_code = self.interpretation_language
|
1300 |
|
1301 |
-
# Language-specific examples
|
1302 |
-
examples = {
|
1303 |
-
"en": "Hello, the weather is nice today",
|
1304 |
-
"es": "Hola, el clima está agradable hoy",
|
1305 |
-
"fr": "Bonjour, il fait beau aujourd'hui",
|
1306 |
-
"de": "Hallo, das Wetter ist heute schön",
|
1307 |
-
"ja": "こんにちは、今日はいい天気ですね",
|
1308 |
-
"zh": "你好,今天天气很好"
|
1309 |
-
}
|
1310 |
-
|
1311 |
-
example_translation = examples.get(target_code, "Hello, the weather is nice today")
|
1312 |
-
|
1313 |
return (
|
1314 |
-
f"
|
1315 |
-
f"
|
1316 |
-
f"
|
1317 |
-
f"
|
1318 |
-
f"
|
1319 |
-
f"
|
1320 |
-
f"
|
1321 |
-
f"
|
1322 |
-
f"
|
1323 |
-
f"
|
1324 |
-
f"
|
|
|
|
|
|
|
|
|
1325 |
)
|
1326 |
|
1327 |
def get_translation_instructions(self):
|
@@ -1368,18 +1402,18 @@ class OpenAIHandler(AsyncStreamHandler):
|
|
1368 |
|
1369 |
# Check if in interpretation mode
|
1370 |
if self.interpretation_mode:
|
1371 |
-
# In interpretation mode, override
|
1372 |
-
|
1373 |
-
f"
|
1374 |
-
f"
|
1375 |
-
f"
|
1376 |
-
f"
|
|
|
1377 |
)
|
1378 |
-
interpretation_instructions = self.get_interpretation_instructions()
|
1379 |
-
instructions = base_instructions + "\n\n" + interpretation_instructions
|
1380 |
# No tools in interpretation mode
|
1381 |
tools = []
|
1382 |
print(f"Interpretation mode active - target language: {self.interpretation_language}")
|
|
|
1383 |
else:
|
1384 |
# Normal mode - add translation instructions if language is selected
|
1385 |
translation_instructions = self.get_translation_instructions()
|
@@ -1430,39 +1464,77 @@ class OpenAIHandler(AsyncStreamHandler):
|
|
1430 |
"type": "server_vad",
|
1431 |
"threshold": 0.5,
|
1432 |
"prefix_padding_ms": 300,
|
1433 |
-
"silence_duration_ms":
|
1434 |
},
|
1435 |
"instructions": instructions,
|
1436 |
"tools": tools,
|
1437 |
-
"tool_choice": "auto" if tools else "none"
|
|
|
|
|
1438 |
}
|
1439 |
|
1440 |
# Add voice setting based on interpretation or translation language
|
1441 |
voice_language = self.interpretation_language if self.interpretation_mode else self.target_language
|
1442 |
-
|
1443 |
-
|
1444 |
-
|
|
|
1445 |
session_update["voice"] = "alloy"
|
|
|
1446 |
|
1447 |
-
#
|
1448 |
-
|
1449 |
-
|
1450 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1451 |
print(f"Voice set to: alloy for language: {voice_language}")
|
1452 |
|
1453 |
-
# For interpretation mode, ensure proper language settings
|
1454 |
-
if self.interpretation_mode and self.interpretation_language:
|
1455 |
-
session_update["modalities"] = ["text", "audio"]
|
1456 |
-
session_update["temperature"] = 0.3 # Lower temperature for more accurate translation
|
1457 |
-
session_update["max_response_output_tokens"] = 500 # Limit output to prevent long generations
|
1458 |
-
print(f"Interpretation session config: voice={session_update.get('voice')}, lang={self.interpretation_language}")
|
1459 |
-
|
1460 |
await conn.session.update(session=session_update)
|
1461 |
self.connection = conn
|
1462 |
print(f"Connected with tools: {len(tools)} functions, voice: {session_update.get('voice', 'default')}, "
|
1463 |
f"interpretation_mode: {self.interpretation_mode}, language: {self.interpretation_language if self.interpretation_mode else self.target_language}")
|
1464 |
|
1465 |
async for event in self.connection:
|
|
|
|
|
|
|
|
|
|
|
1466 |
# Debug logging for function calls
|
1467 |
if event.type.startswith("response.function_call"):
|
1468 |
print(f"Function event: {event.type}")
|
@@ -1473,7 +1545,8 @@ class OpenAIHandler(AsyncStreamHandler):
|
|
1473 |
"language": SUPPORTED_LANGUAGES.get(
|
1474 |
self.interpretation_language if self.interpretation_mode else self.target_language,
|
1475 |
""
|
1476 |
-
) if (self.interpretation_language or self.target_language) else ""
|
|
|
1477 |
}
|
1478 |
await self.output_queue.put(AdditionalOutputs(output_data))
|
1479 |
|
@@ -1687,7 +1760,8 @@ async def outputs(webrtc_id: str):
|
|
1687 |
data = {
|
1688 |
"role": "assistant",
|
1689 |
"content": event.transcript,
|
1690 |
-
"language": output.args[0].get('language', '')
|
|
|
1691 |
}
|
1692 |
yield f"event: output\ndata: {json.dumps(data)}\n\n"
|
1693 |
|
|
|
719 |
sendButton.style.display = 'block';
|
720 |
|
721 |
console.log('Interpretation mode disabled');
|
722 |
+
|
723 |
+
// If connected, restart to apply normal mode
|
724 |
+
if (peerConnection && peerConnection.connectionState === 'connected') {
|
725 |
+
showError('일반 모드로 전환하기 위해 연결을 다시 시작합니다.');
|
726 |
+
stop();
|
727 |
+
setTimeout(() => {
|
728 |
+
setupWebRTC();
|
729 |
+
}, 500);
|
730 |
+
}
|
731 |
}
|
732 |
|
733 |
console.log('Interpretation mode:', interpretationMode);
|
|
|
756 |
sendButton.style.display = 'none';
|
757 |
|
758 |
console.log('Interpretation mode enabled with language:', interpretationLanguage);
|
759 |
+
|
760 |
+
// If already connected, restart the connection with new settings
|
761 |
+
if (peerConnection && peerConnection.connectionState === 'connected') {
|
762 |
+
showError('통역 모드 설정을 적용하기 위해 연결을 다시 시작합니다.');
|
763 |
+
stop();
|
764 |
+
setTimeout(() => {
|
765 |
+
setupWebRTC();
|
766 |
+
}, 500);
|
767 |
+
}
|
768 |
}
|
769 |
});
|
770 |
|
|
|
997 |
updateButtonState();
|
998 |
});
|
999 |
webrtc_id = Math.random().toString(36).substring(7);
|
1000 |
+
|
1001 |
+
// Log current settings before sending
|
1002 |
+
console.log('Sending offer with settings:', {
|
1003 |
+
webrtc_id: webrtc_id,
|
1004 |
+
web_search_enabled: webSearchEnabled,
|
1005 |
+
target_language: selectedLanguage,
|
1006 |
+
system_prompt: systemPrompt,
|
1007 |
+
interpretation_mode: interpretationMode,
|
1008 |
+
interpretation_language: interpretationLanguage
|
1009 |
+
});
|
1010 |
+
|
1011 |
const response = await fetch('/webrtc/offer', {
|
1012 |
method: 'POST',
|
1013 |
headers: { 'Content-Type': 'application/json' },
|
|
|
1035 |
eventSource.addEventListener("output", (event) => {
|
1036 |
const eventJson = JSON.parse(event.data);
|
1037 |
let content = eventJson.content;
|
1038 |
+
|
1039 |
+
// Debug logging for interpretation mode
|
1040 |
+
if (interpretationMode) {
|
1041 |
+
console.log('[INTERPRETATION OUTPUT]', {
|
1042 |
+
content: content,
|
1043 |
+
language: eventJson.language,
|
1044 |
+
mode: eventJson.mode,
|
1045 |
+
expectedLanguage: interpretationLanguage
|
1046 |
+
});
|
1047 |
+
}
|
1048 |
+
|
1049 |
if (selectedLanguage && eventJson.language) {
|
1050 |
content += ` <span class="language-info">[${eventJson.language}]</span>`;
|
1051 |
+
} else if (interpretationMode && eventJson.language) {
|
1052 |
+
content += ` <span class="language-info">[통역: ${eventJson.language}]</span>`;
|
1053 |
}
|
1054 |
addMessage("assistant", content);
|
1055 |
});
|
|
|
1340 |
target_language_name = SUPPORTED_LANGUAGES.get(self.interpretation_language, self.interpretation_language)
|
1341 |
target_code = self.interpretation_language
|
1342 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1343 |
return (
|
1344 |
+
f"YOU ARE IN TRANSLATION MODE. YOUR ONLY FUNCTION IS TO TRANSLATE.\n\n"
|
1345 |
+
f"RULES:\n"
|
1346 |
+
f"1. TRANSLATE the input to {target_language_name} ({target_code})\n"
|
1347 |
+
f"2. OUTPUT ONLY THE TRANSLATION\n"
|
1348 |
+
f"3. DO NOT ANSWER QUESTIONS\n"
|
1349 |
+
f"4. DO NOT PROVIDE INFORMATION\n"
|
1350 |
+
f"5. DO NOT CONTINUE SPEAKING\n"
|
1351 |
+
f"6. NEVER OUTPUT IN ANY OTHER LANGUAGE THAN {target_language_name}\n\n"
|
1352 |
+
f"EXAMPLES:\n"
|
1353 |
+
f"Input: '안녕하세요' → Output: 'Hello' (if English)\n"
|
1354 |
+
f"Input: '날씨가 어때요?' → Output: 'How is the weather?' (if English)\n"
|
1355 |
+
f"Input: '커피 한 잔 주세요' → Output: 'One coffee please' (if English)\n\n"
|
1356 |
+
f"YOU ARE NOT AN AI ASSISTANT. YOU ARE A TRANSLATOR.\n"
|
1357 |
+
f"TRANSLATE AND STOP. NO ADDITIONAL WORDS.\n"
|
1358 |
+
f"OUTPUT LANGUAGE: {target_language_name} ONLY."
|
1359 |
)
|
1360 |
|
1361 |
def get_translation_instructions(self):
|
|
|
1402 |
|
1403 |
# Check if in interpretation mode
|
1404 |
if self.interpretation_mode:
|
1405 |
+
# In interpretation mode, completely override everything
|
1406 |
+
instructions = (
|
1407 |
+
f"CRITICAL: YOU ARE A TRANSLATION SERVICE.\n"
|
1408 |
+
f"TRANSLATE USER INPUT TO {SUPPORTED_LANGUAGES.get(self.interpretation_language, self.interpretation_language)}.\n"
|
1409 |
+
f"OUTPUT ONLY THE TRANSLATION. NOTHING ELSE.\n"
|
1410 |
+
f"DO NOT ACT AS AN AI. DO NOT ANSWER QUESTIONS.\n"
|
1411 |
+
f"JUST TRANSLATE AND STOP."
|
1412 |
)
|
|
|
|
|
1413 |
# No tools in interpretation mode
|
1414 |
tools = []
|
1415 |
print(f"Interpretation mode active - target language: {self.interpretation_language}")
|
1416 |
+
print(f"Instructions: {instructions}")
|
1417 |
else:
|
1418 |
# Normal mode - add translation instructions if language is selected
|
1419 |
translation_instructions = self.get_translation_instructions()
|
|
|
1464 |
"type": "server_vad",
|
1465 |
"threshold": 0.5,
|
1466 |
"prefix_padding_ms": 300,
|
1467 |
+
"silence_duration_ms": 100 if self.interpretation_mode else 700 # Even shorter silence
|
1468 |
},
|
1469 |
"instructions": instructions,
|
1470 |
"tools": tools,
|
1471 |
+
"tool_choice": "none" if self.interpretation_mode else ("auto" if tools else "none"),
|
1472 |
+
"temperature": 0.0 if self.interpretation_mode else 0.7,
|
1473 |
+
"max_response_output_tokens": 50 if self.interpretation_mode else 4096 # Very short responses
|
1474 |
}
|
1475 |
|
1476 |
# Add voice setting based on interpretation or translation language
|
1477 |
voice_language = self.interpretation_language if self.interpretation_mode else self.target_language
|
1478 |
+
|
1479 |
+
# For interpretation mode, ensure proper settings
|
1480 |
+
if self.interpretation_mode and self.interpretation_language:
|
1481 |
+
# Force simple translation behavior
|
1482 |
session_update["voice"] = "alloy"
|
1483 |
+
session_update["modalities"] = ["audio", "text"] # Changed order
|
1484 |
|
1485 |
+
# Create a very specific prompt for the target language
|
1486 |
+
target_lang_name = SUPPORTED_LANGUAGES.get(self.interpretation_language, self.interpretation_language)
|
1487 |
+
|
1488 |
+
# Use very explicit language instructions
|
1489 |
+
if self.interpretation_language == "en":
|
1490 |
+
lang_specific = "Respond in English only. 영어로만 답하세요."
|
1491 |
+
elif self.interpretation_language == "es":
|
1492 |
+
lang_specific = "Respond in Spanish only. Solo responde en español."
|
1493 |
+
elif self.interpretation_language == "fr":
|
1494 |
+
lang_specific = "Respond in French only. Répondez uniquement en français."
|
1495 |
+
elif self.interpretation_language == "de":
|
1496 |
+
lang_specific = "Respond in German only. Antworten Sie nur auf Deutsch."
|
1497 |
+
elif self.interpretation_language == "ja":
|
1498 |
+
lang_specific = "Respond in Japanese only. 日本語でのみ答えてください。"
|
1499 |
+
elif self.interpretation_language == "zh":
|
1500 |
+
lang_specific = "Respond in Chinese only. 只用中文回答。"
|
1501 |
+
else:
|
1502 |
+
lang_specific = f"Respond in {target_lang_name} only."
|
1503 |
+
|
1504 |
+
# Override instructions with ultra-specific directive
|
1505 |
+
session_update["instructions"] = (
|
1506 |
+
f"TRANSLATE TO {target_lang_name.upper()}. "
|
1507 |
+
f"{lang_specific} "
|
1508 |
+
f"You are a translation machine. "
|
1509 |
+
f"Input → Translation. Nothing else. "
|
1510 |
+
f"Do not chat. Do not explain. Just translate. "
|
1511 |
+
f"Maximum 20 words per response."
|
1512 |
+
)
|
1513 |
+
|
1514 |
+
# Additional session parameters for interpretation mode
|
1515 |
+
session_update["input_audio_transcription"] = {
|
1516 |
+
"model": "whisper-1"
|
1517 |
+
}
|
1518 |
+
|
1519 |
+
print(f"[INTERPRETATION MODE] Target: {self.interpretation_language} ({target_lang_name})")
|
1520 |
+
print(f"[INTERPRETATION MODE] Instructions: {session_update['instructions']}")
|
1521 |
+
print(f"[INTERPRETATION MODE] Session config: {session_update}")
|
1522 |
+
elif voice_language:
|
1523 |
+
# Normal translation mode
|
1524 |
+
session_update["voice"] = "alloy"
|
1525 |
print(f"Voice set to: alloy for language: {voice_language}")
|
1526 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1527 |
await conn.session.update(session=session_update)
|
1528 |
self.connection = conn
|
1529 |
print(f"Connected with tools: {len(tools)} functions, voice: {session_update.get('voice', 'default')}, "
|
1530 |
f"interpretation_mode: {self.interpretation_mode}, language: {self.interpretation_language if self.interpretation_mode else self.target_language}")
|
1531 |
|
1532 |
async for event in self.connection:
|
1533 |
+
# Debug logging for interpretation mode
|
1534 |
+
if self.interpretation_mode and event.type == "response.audio_transcript.done":
|
1535 |
+
print(f"[INTERPRETATION] Transcript: {event.transcript[:100]}...")
|
1536 |
+
print(f"[INTERPRETATION] Expected language: {self.interpretation_language}")
|
1537 |
+
|
1538 |
# Debug logging for function calls
|
1539 |
if event.type.startswith("response.function_call"):
|
1540 |
print(f"Function event: {event.type}")
|
|
|
1545 |
"language": SUPPORTED_LANGUAGES.get(
|
1546 |
self.interpretation_language if self.interpretation_mode else self.target_language,
|
1547 |
""
|
1548 |
+
) if (self.interpretation_language or self.target_language) else "",
|
1549 |
+
"mode": "interpretation" if self.interpretation_mode else "normal"
|
1550 |
}
|
1551 |
await self.output_queue.put(AdditionalOutputs(output_data))
|
1552 |
|
|
|
1760 |
data = {
|
1761 |
"role": "assistant",
|
1762 |
"content": event.transcript,
|
1763 |
+
"language": output.args[0].get('language', ''),
|
1764 |
+
"mode": output.args[0].get('mode', 'normal')
|
1765 |
}
|
1766 |
yield f"event: output\ndata: {json.dumps(data)}\n\n"
|
1767 |
|