Update app.py

app.py CHANGED
@@ -20,6 +20,9 @@ from openai.types.beta.realtime import ResponseAudioTranscriptDoneEvent
 import httpx
 from typing import Optional, List, Dict
 import gradio as gr
+import io
+from scipy import signal
+import wave
 
 load_dotenv()
 
@@ -315,6 +318,15 @@ HTML_CONTENT = """<!DOCTYPE html>
             padding: 10px;
             margin-bottom: 10px;
         }
+        .message.assistant.interpretation {
+            background: linear-gradient(135deg, #1a5a3e, #2e7d32);
+            font-style: italic;
+        }
+        .interpretation-arrow {
+            color: #4caf50;
+            font-weight: bold;
+            margin: 0 10px;
+        }
         .controls {
             text-align: center;
             margin-top: auto;
@@ -618,7 +630,10 @@ HTML_CONTENT = """<!DOCTYPE html>
             </div>
         </div>
         <div class="interpretation-info" id="interpretation-info" style="display: none;">
-
+            <strong>통역 모드 안내:</strong><br>
+            • 음성으로 말하면 선택한 언어로 자동 통역됩니다<br>
+            • Whisper + GPT-4o-mini + TTS를 사용합니다<br>
+            • 말을 마치고 잠시 기다리면 통역이 시작됩니다
         </div>
         <div class="text-input-section">
             <label for="system-prompt" class="setting-label">시스템 프롬프트:</label>
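The panel added in this hunk describes the interpretation flow end to end: speech is transcribed with Whisper, translated by GPT-4o-mini, and read back with TTS once the speaker pauses. For readers skimming the diff, here is a minimal, self-contained sketch of that three-step flow; it is not the Space's handler code, and the `translate_speech` helper, the WAV path, and the English default target are illustrative assumptions.

```python
# Minimal sketch of the flow the panel describes: Whisper -> GPT-4o-mini -> TTS.
# Assumes OPENAI_API_KEY is set; translate_speech and its defaults are illustrative,
# not the Space's actual handler code.
import asyncio
import openai

async def translate_speech(wav_path: str, target_lang: str = "English") -> bytes:
    client = openai.AsyncOpenAI()

    # 1. Speech -> text (Whisper)
    with open(wav_path, "rb") as f:
        transcript = await client.audio.transcriptions.create(model="whisper-1", file=f)

    # 2. Text -> target language (GPT-4o-mini)
    chat = await client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system",
             "content": f"Translate the user's text to {target_lang}. Reply with the translation only."},
            {"role": "user", "content": transcript.text},
        ],
        temperature=0.3,
    )
    translated = chat.choices[0].message.content

    # 3. Text -> speech (TTS); collect the audio bytes the same way the handler in this diff does
    speech = await client.audio.speech.create(model="tts-1", voice="alloy", input=translated)
    audio = b""
    async for chunk in speech.iter_bytes(1024):
        audio += chunk
    return audio

# Example: asyncio.run(translate_speech("sample.wav", "English"))
```

The real handler below differs mainly in that it streams the TTS output back as fixed-size PCM chunks instead of returning one byte string.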
@@ -1049,6 +1064,16 @@ HTML_CONTENT = """<!DOCTYPE html>
                 if (selectedLanguage && eventJson.language) {
                     content += ` <span class="language-info">[${eventJson.language}]</span>`;
                 } else if (interpretationMode && eventJson.language) {
+                    // In interpretation mode, show the translation process
+                    if (content.includes('→')) {
+                        // Format: "Korean text → English text"
+                        const parts = content.split('→');
+                        if (parts.length === 2) {
+                            content = `<span style="color: #999;">${parts[0].trim()}</span>` +
+                                      `<span class="interpretation-arrow">→</span>` +
+                                      `<strong>${parts[1].trim()}</strong>`;
+                        }
+                    }
                     content += ` <span class="language-info">[통역: ${eventJson.language}]</span>`;
                 }
                 addMessage("assistant", content);
@@ -1069,6 +1094,12 @@ HTML_CONTENT = """<!DOCTYPE html>
             function addMessage(role, content) {
                 const messageDiv = document.createElement('div');
                 messageDiv.classList.add('message', role);
+
+                // Check if it's an interpretation message
+                if (interpretationMode && role === 'assistant' && content.includes('→')) {
+                    messageDiv.classList.add('interpretation');
+                }
+
                 if (content.includes('<span')) {
                     messageDiv.innerHTML = content;
                 } else {
@@ -1273,6 +1304,13 @@ class OpenAIHandler(AsyncStreamHandler):
         self.system_prompt = system_prompt
         self.interpretation_mode = interpretation_mode
         self.interpretation_language = interpretation_language
+
+        # For interpretation mode
+        self.audio_buffer = []
+        self.is_recording = False
+        self.silence_frames = 0
+        self.silence_threshold = 30  # Number of silent frames before stopping
+
         print(f"Handler created with web_search_enabled={web_search_enabled}, "
               f"target_language={target_language}, webrtc_id={webrtc_id}, "
               f"interpretation_mode={interpretation_mode}, interpretation_language={interpretation_language}")
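The new buffering state ends an utterance after `silence_threshold = 30` consecutive quiet frames, so the actual pause length depends on how many samples each frame carries and at what rate. A hedged back-of-the-envelope, assuming the 480-sample chunk size used elsewhere in this commit and a 24 kHz stream (both assumptions; the app's real `SAMPLE_RATE` constant is defined outside this diff):

```python
# Rough estimate only: FRAME_SAMPLES and SAMPLE_RATE are assumptions, not values
# confirmed by this diff (SAMPLE_RATE is defined elsewhere in app.py).
FRAME_SAMPLES = 480        # assumed from the chunk_size used when emitting TTS audio
SAMPLE_RATE = 24_000       # assumed stream rate
SILENCE_THRESHOLD = 30     # frames, as set in __init__

pause = SILENCE_THRESHOLD * FRAME_SAMPLES / SAMPLE_RATE
print(f"interpretation triggers after roughly {pause:.2f}s of silence")  # ~0.60s
```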
@@ -1297,7 +1335,7 @@ class OpenAIHandler(AsyncStreamHandler):
         )
 
         print(f"Handler.copy() called - creating new handler with default settings")
-        return OpenAIHandler(web_search_enabled=False)
+        return OpenAIHandler(web_search_enabled=False, interpretation_mode=False)
 
     async def search_web(self, query: str) -> str:
         """Perform web search and return formatted results"""
@@ -1332,31 +1370,122 @@ class OpenAIHandler(AsyncStreamHandler):
             )
             await self.connection.response.create()
 
-    def
-        """
-        if not self.
-            return
-
-        target_language_name = SUPPORTED_LANGUAGES.get(self.interpretation_language, self.interpretation_language)
-        target_code = self.interpretation_language
-
-
-        f"
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    async def process_interpretation(self):
+        """Process audio buffer for interpretation"""
+        if not self.audio_buffer or not self.interpretation_language:
+            return
+
+        try:
+            print(f"[INTERPRETATION] Processing audio buffer with {len(self.audio_buffer)} frames")
+
+            # Convert audio buffer to WAV format
+            audio_data = np.concatenate(self.audio_buffer)
+
+            # Create WAV file in memory
+            wav_buffer = io.BytesIO()
+            with wave.open(wav_buffer, 'wb') as wav_file:
+                wav_file.setnchannels(1)  # Mono
+                wav_file.setsampwidth(2)  # 16-bit
+                wav_file.setframerate(SAMPLE_RATE)
+                wav_file.writeframes(audio_data.tobytes())
+
+            wav_buffer.seek(0)
+            wav_buffer.name = "audio.wav"
+
+            # 1. Transcribe with Whisper
+            print("[INTERPRETATION] Transcribing with Whisper...")
+            transcript = await self.client.audio.transcriptions.create(
+                model="whisper-1",
+                file=wav_buffer,
+                language="ko"  # Assuming Korean input
+            )
+
+            user_text = transcript.text.strip()
+            print(f"[INTERPRETATION] Transcribed: {user_text}")
+
+            if not user_text:
+                return
+
+            # 2. Translate with GPT-4o-mini
+            target_lang_name = SUPPORTED_LANGUAGES.get(self.interpretation_language, self.interpretation_language)
+
+            print(f"[INTERPRETATION] Translating to {target_lang_name}...")
+            translation_response = await self.client.chat.completions.create(
+                model="gpt-4o-mini",
+                messages=[
+                    {
+                        "role": "system",
+                        "content": f"You are a translator. Translate the following text to {target_lang_name}. "
+                                   f"Provide only the translation, nothing else."
+                    },
+                    {
+                        "role": "user",
+                        "content": user_text
+                    }
+                ],
+                temperature=0.3,
+                max_tokens=200
+            )
+
+            translated_text = translation_response.choices[0].message.content.strip()
+            print(f"[INTERPRETATION] Translated: {translated_text}")
+
+            # 3. Generate speech with TTS
+            print("[INTERPRETATION] Generating speech...")
+            tts_response = await self.client.audio.speech.create(
+                model="tts-1",
+                voice="alloy",
+                input=translated_text,
+                response_format="pcm",  # PCM format for direct playback
+                speed=1.0
+            )
+
+            # Convert response to bytes
+            audio_bytes = b""
+            async for chunk in tts_response.iter_bytes(1024):
+                audio_bytes += chunk
+
+            # Convert PCM to numpy array
+            audio_array = np.frombuffer(audio_bytes, dtype=np.int16)
+
+            # Resample from 24kHz (TTS output) to our sample rate if needed
+            if len(audio_array) > 0:
+                # Split audio into chunks and send
+                chunk_size = 480  # Match our frame size
+                for i in range(0, len(audio_array), chunk_size):
+                    chunk = audio_array[i:i + chunk_size]
+                    if len(chunk) < chunk_size:
+                        # Pad the last chunk if necessary
+                        chunk = np.pad(chunk, (0, chunk_size - len(chunk)), 'constant')
+
+                    await self.output_queue.put((SAMPLE_RATE, chunk.reshape(1, -1)))
+
+            # Send transcript event
+            output_data = {
+                "event": type('Event', (), {
+                    'transcript': f"{user_text} → {translated_text}"
+                })(),
+                "language": target_lang_name,
+                "mode": "interpretation"
+            }
+            await self.output_queue.put(AdditionalOutputs(output_data))
+
+        except Exception as e:
+            print(f"[INTERPRETATION] Error: {e}")
+            # Send error message to client
+            error_data = {
+                "event": type('Event', (), {
+                    'transcript': f"통역 오류: {str(e)}"
+                })(),
+                "language": "",
+                "mode": "error"
+            }
+            await self.output_queue.put(AdditionalOutputs(error_data))
+        finally:
+            # Clear the audio buffer
+            self.audio_buffer = []
+            self.is_recording = False
+            self.silence_frames = 0
 
     def get_translation_instructions(self):
         """Get instructions for translation based on target language"""
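Two loose ends in this hunk are worth noting: `from scipy import signal` is imported but never called, and the comment before the chunking loop says the 24 kHz PCM returned by `tts-1` should be resampled to the handler's rate, yet no resampling happens. If the rates do differ, the missing step could look roughly like the sketch below; the `TTS_RATE` and `SAMPLE_RATE` values are assumptions, not constants confirmed by this diff.

```python
# Hedged sketch of the resampling step the comment alludes to; the diff imports
# scipy.signal but never calls it. Rates below are assumptions.
import numpy as np
from scipy import signal

TTS_RATE = 24_000      # PCM rate produced by tts-1
SAMPLE_RATE = 16_000   # replace with the app's real output rate

def resample_pcm(audio: np.ndarray, src: int = TTS_RATE, dst: int = SAMPLE_RATE) -> np.ndarray:
    """Polyphase-resample int16 PCM from src Hz to dst Hz."""
    if src == dst:
        return audio
    resampled = signal.resample_poly(audio.astype(np.float32), dst, src)
    return np.clip(resampled, -32768, 32767).astype(np.int16)
```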
@@ -1371,7 +1500,7 @@ class OpenAIHandler(AsyncStreamHandler):
         )
 
     async def start_up(self):
-        """Connect to realtime API
+        """Connect to realtime API or setup interpretation mode"""
         # First check if we have the most recent settings
         if connection_settings:
             recent_ids = sorted(connection_settings.keys(),
@@ -1394,147 +1523,91 @@ class OpenAIHandler(AsyncStreamHandler):
         print(f"Starting up handler with web_search_enabled={self.web_search_enabled}, "
               f"target_language={self.target_language}, interpretation_mode={self.interpretation_mode}, "
               f"interpretation_language={self.interpretation_language}")
+
         self.client = openai.AsyncOpenAI()
 
+        # If in interpretation mode, don't connect to Realtime API
+        if self.interpretation_mode:
+            print(f"[INTERPRETATION MODE] Active - using Whisper + GPT-4o-mini + TTS")
+            print(f"[INTERPRETATION MODE] Target language: {self.interpretation_language}")
+            # Just keep the handler ready to process audio
+            return
+
+        # Normal mode - connect to Realtime API
         # Define the web search function
         tools = []
         base_instructions = self.system_prompt or "You are a helpful assistant."
 
-            #
-
-
-
-
-
-
-
-
+        # Add translation instructions if language is selected
+        translation_instructions = self.get_translation_instructions()
+
+        if self.web_search_enabled and self.search_client:
+            tools = [{
+                "type": "function",
+                "function": {
+                    "name": "web_search",
+                    "description": "Search the web for current information. Use this for weather, news, prices, current events, or any time-sensitive topics.",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "query": {
+                                "type": "string",
+                                "description": "The search query"
+                            }
+                        },
+                        "required": ["query"]
+                    }
+                }
+            }]
+            print("Web search function added to tools")
+
+            search_instructions = (
+                "\n\nYou have web search capabilities. "
+                "IMPORTANT: You MUST use the web_search function for ANY of these topics:\n"
+                "- Weather (날씨, 기온, 비, 눈)\n"
+                "- News (뉴스, 소식)\n"
+                "- Current events (현재, 최근, 오늘, 지금)\n"
+                "- Prices (가격, 환율, 주가)\n"
+                "- Sports scores or results\n"
+                "- Any question about 2024 or 2025\n"
+                "- Any time-sensitive information\n\n"
+                "When in doubt, USE web_search. It's better to search and provide accurate information "
+                "than to guess or use outdated information."
             )
-
-            tools = []
-            print(f"Interpretation mode active - target language: {self.interpretation_language}")
-            print(f"Instructions: {instructions}")
+            instructions = base_instructions + search_instructions + translation_instructions
         else:
-
-            translation_instructions = self.get_translation_instructions()
-
-            if self.web_search_enabled and self.search_client:
-                tools = [{
-                    "type": "function",
-                    "function": {
-                        "name": "web_search",
-                        "description": "Search the web for current information. Use this for weather, news, prices, current events, or any time-sensitive topics.",
-                        "parameters": {
-                            "type": "object",
-                            "properties": {
-                                "query": {
-                                    "type": "string",
-                                    "description": "The search query"
-                                }
-                            },
-                            "required": ["query"]
-                        }
-                    }
-                }]
-                print("Web search function added to tools")
-
-                search_instructions = (
-                    "\n\nYou have web search capabilities. "
-                    "IMPORTANT: You MUST use the web_search function for ANY of these topics:\n"
-                    "- Weather (날씨, 기온, 비, 눈)\n"
-                    "- News (뉴스, 소식)\n"
-                    "- Current events (현재, 최근, 오늘, 지금)\n"
-                    "- Prices (가격, 환율, 주가)\n"
-                    "- Sports scores or results\n"
-                    "- Any question about 2024 or 2025\n"
-                    "- Any time-sensitive information\n\n"
-                    "When in doubt, USE web_search. It's better to search and provide accurate information "
-                    "than to guess or use outdated information."
-                )
-                instructions = base_instructions + search_instructions + translation_instructions
-            else:
-                instructions = base_instructions + translation_instructions
+            instructions = base_instructions + translation_instructions
 
         async with self.client.beta.realtime.connect(
             model="gpt-4o-mini-realtime-preview-2024-12-17"
         ) as conn:
             # Update session with tools
             session_update = {
-                "turn_detection": {
-                    "type": "server_vad",
-                    "threshold": 0.5,
-                    "prefix_padding_ms": 300,
-                    "silence_duration_ms": 100 if self.interpretation_mode else 700  # Even shorter silence
-                },
+                "turn_detection": {"type": "server_vad"},
                 "instructions": instructions,
                 "tools": tools,
-                "tool_choice": "
-                "temperature": 0.0 if self.interpretation_mode else 0.7,
-                "max_response_output_tokens": 50 if self.interpretation_mode else 4096  # Very short responses
+                "tool_choice": "auto" if tools else "none"
             }
 
-            # Add voice setting
-
-
-
-
-
-
-
-
-
-
-
-                # Use very explicit language instructions
-                if self.interpretation_language == "en":
-                    lang_specific = "Respond in English only. 영어로만 답하세요."
-                elif self.interpretation_language == "es":
-                    lang_specific = "Respond in Spanish only. Solo responde en español."
-                elif self.interpretation_language == "fr":
-                    lang_specific = "Respond in French only. Répondez uniquement en français."
-                elif self.interpretation_language == "de":
-                    lang_specific = "Respond in German only. Antworten Sie nur auf Deutsch."
-                elif self.interpretation_language == "ja":
-                    lang_specific = "Respond in Japanese only. 日本語でのみ答えてください。"
-                elif self.interpretation_language == "zh":
-                    lang_specific = "Respond in Chinese only. 只用中文回答。"
-                else:
-                    lang_specific = f"Respond in {target_lang_name} only."
-
-                # Override instructions with ultra-specific directive
-                session_update["instructions"] = (
-                    f"TRANSLATE TO {target_lang_name.upper()}. "
-                    f"{lang_specific} "
-                    f"You are a translation machine. "
-                    f"Input → Translation. Nothing else. "
-                    f"Do not chat. Do not explain. Just translate. "
-                    f"Maximum 20 words per response."
-                )
-
-                # Additional session parameters for interpretation mode
-                session_update["input_audio_transcription"] = {
-                    "model": "whisper-1"
+            # Add voice setting if target language is selected
+            if self.target_language:
+                # Map languages to appropriate voices
+                voice_map = {
+                    "en": "alloy",
+                    "es": "nova",
+                    "fr": "nova",
+                    "de": "nova",
+                    "ja": "nova",
+                    "zh": "nova",
+                    # Default to alloy for other languages
                 }
-
-                print(f"[INTERPRETATION MODE] Target: {self.interpretation_language} ({target_lang_name})")
-                print(f"[INTERPRETATION MODE] Instructions: {session_update['instructions']}")
-                print(f"[INTERPRETATION MODE] Session config: {session_update}")
-            elif voice_language:
-                # Normal translation mode
-                session_update["voice"] = "alloy"
-                print(f"Voice set to: alloy for language: {voice_language}")
+                session_update["voice"] = voice_map.get(self.target_language, "alloy")
 
             await conn.session.update(session=session_update)
             self.connection = conn
-            print(f"Connected with tools: {len(tools)} functions, voice: {session_update.get('voice', 'default')}
-                  f"interpretation_mode: {self.interpretation_mode}, language: {self.interpretation_language if self.interpretation_mode else self.target_language}")
+            print(f"Connected with tools: {len(tools)} functions, voice: {session_update.get('voice', 'default')}")
 
             async for event in self.connection:
-                # Debug logging for interpretation mode
-                if self.interpretation_mode and event.type == "response.audio_transcript.done":
-                    print(f"[INTERPRETATION] Transcript: {event.transcript[:100]}...")
-                    print(f"[INTERPRETATION] Expected language: {self.interpretation_language}")
-
                 # Debug logging for function calls
                 if event.type.startswith("response.function_call"):
                     print(f"Function event: {event.type}")
@@ -1542,11 +1615,7 @@ class OpenAIHandler(AsyncStreamHandler):
                 if event.type == "response.audio_transcript.done":
                     output_data = {
                         "event": event,
-                        "language": SUPPORTED_LANGUAGES.get(
-                            self.interpretation_language if self.interpretation_mode else self.target_language,
-                            ""
-                        ) if (self.interpretation_language or self.target_language) else "",
-                        "mode": "interpretation" if self.interpretation_mode else "normal"
+                        "language": SUPPORTED_LANGUAGES.get(self.target_language, "") if self.target_language else ""
                     }
                     await self.output_queue.put(AdditionalOutputs(output_data))
 
@@ -1607,18 +1676,46 @@ class OpenAIHandler(AsyncStreamHandler):
             self.current_call_id = None
 
     async def receive(self, frame: tuple[int, np.ndarray]) -> None:
-        if
-
-        try:
+        if self.interpretation_mode:
+            # In interpretation mode, buffer audio and process with Whisper
             _, array = frame
             array = array.squeeze()
-
-
-
-
-
+
+            # Simple voice activity detection
+            audio_level = np.abs(array).mean()
+
+            if audio_level > 300:  # Lowered threshold for better detection
+                self.is_recording = True
+                self.silence_frames = 0
+                self.audio_buffer.append(array)
+                if len(self.audio_buffer) % 10 == 0:  # Log every 10 frames
+                    print(f"[INTERPRETATION] Recording... buffer size: {len(self.audio_buffer)}, level: {audio_level:.1f}")
+            elif self.is_recording:
+                self.silence_frames += 1
+                self.audio_buffer.append(array)
+
+                # If we've had enough silence, process the audio
+                if self.silence_frames > self.silence_threshold:
+                    print(f"[INTERPRETATION] Silence detected, processing {len(self.audio_buffer)} frames")
+                    await self.process_interpretation()
+        else:
+            # Normal mode - use Realtime API
+            if not self.connection:
+                return
+            try:
+                _, array = frame
+                array = array.squeeze()
+                audio_message = base64.b64encode(array.tobytes()).decode("utf-8")
+                await self.connection.input_audio_buffer.append(audio=audio_message)
+            except Exception as e:
+                print(f"Error in receive: {e}")
+                # Connection might be closed, ignore the error
 
     async def emit(self) -> tuple[int, np.ndarray] | AdditionalOutputs | None:
+        # In interpretation mode, check if we need to process buffered audio
+        if self.interpretation_mode and self.is_recording and self.silence_frames > self.silence_threshold:
+            await self.process_interpretation()
+
         item = await wait_for_item(self.output_queue)
 
         # Check if it's a dict with text message
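The logic added to `receive()` is a small energy-based voice-activity state machine: frames whose mean amplitude exceeds a threshold (re)start recording, and a run of quieter frames flushes the buffer into `process_interpretation()`. The same idea, restated as a standalone helper for reference; the 300 amplitude level and 30-frame limit simply mirror the values in the diff and are not tuned.

```python
# Standalone restatement of the silence-segmentation logic used in receive();
# thresholds mirror the diff and are not tuned values.
import numpy as np

class SilenceSegmenter:
    def __init__(self, level: float = 300, max_silence: int = 30):
        self.level = level              # mean |amplitude| that counts as speech
        self.max_silence = max_silence  # quiet frames tolerated before flushing
        self.buffer: list[np.ndarray] = []
        self.recording = False
        self.silence = 0

    def push(self, frame: np.ndarray) -> np.ndarray | None:
        """Feed one audio frame; returns the buffered utterance when a pause ends it."""
        if np.abs(frame).mean() > self.level:
            self.recording, self.silence = True, 0
            self.buffer.append(frame)
        elif self.recording:
            self.silence += 1
            self.buffer.append(frame)
            if self.silence > self.max_silence:
                utterance = np.concatenate(self.buffer)
                self.buffer, self.recording, self.silence = [], False, 0
                return utterance
        return None
```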
@@ -1629,9 +1726,16 @@ class OpenAIHandler(AsyncStreamHandler):
             return item
 
     async def shutdown(self) -> None:
-        if self.
-
-        self.
+        if self.interpretation_mode:
+            # Clean up interpretation mode
+            self.audio_buffer = []
+            self.is_recording = False
+            print("[INTERPRETATION MODE] Shutdown complete")
+        else:
+            # Normal mode - close Realtime API connection
+            if self.connection:
+                await self.connection.close()
+                self.connection = None
 
 
 # Create initial handler instance