seawolf2357 committed
Commit 24f7b71 · verified · 1 Parent(s): 17b6035

Update app.py

Files changed (1):
  1. app.py +264 -160
app.py CHANGED
@@ -20,6 +20,9 @@ from openai.types.beta.realtime import ResponseAudioTranscriptDoneEvent
20
  import httpx
21
  from typing import Optional, List, Dict
22
  import gradio as gr
23
 
24
  load_dotenv()
25
 
@@ -315,6 +318,15 @@ HTML_CONTENT = """<!DOCTYPE html>
315
  padding: 10px;
316
  margin-bottom: 10px;
317
  }
318
  .controls {
319
  text-align: center;
320
  margin-top: auto;
@@ -618,7 +630,10 @@ HTML_CONTENT = """<!DOCTYPE html>
618
  </div>
619
  </div>
620
  <div class="interpretation-info" id="interpretation-info" style="display: none;">
621
- 통역 모드: 입력한 음성이 선택한 언어로 자동 통역됩니다.
622
  </div>
623
  <div class="text-input-section">
624
  <label for="system-prompt" class="setting-label">시스템 프롬프트:</label>
@@ -1049,6 +1064,16 @@ HTML_CONTENT = """<!DOCTYPE html>
1049
  if (selectedLanguage && eventJson.language) {
1050
  content += ` <span class="language-info">[${eventJson.language}]</span>`;
1051
  } else if (interpretationMode && eventJson.language) {
1052
  content += ` <span class="language-info">[통역: ${eventJson.language}]</span>`;
1053
  }
1054
  addMessage("assistant", content);
@@ -1069,6 +1094,12 @@ HTML_CONTENT = """<!DOCTYPE html>
1069
  function addMessage(role, content) {
1070
  const messageDiv = document.createElement('div');
1071
  messageDiv.classList.add('message', role);
1072
  if (content.includes('<span')) {
1073
  messageDiv.innerHTML = content;
1074
  } else {
@@ -1273,6 +1304,13 @@ class OpenAIHandler(AsyncStreamHandler):
1273
  self.system_prompt = system_prompt
1274
  self.interpretation_mode = interpretation_mode
1275
  self.interpretation_language = interpretation_language
1276
  print(f"Handler created with web_search_enabled={web_search_enabled}, "
1277
  f"target_language={target_language}, webrtc_id={webrtc_id}, "
1278
  f"interpretation_mode={interpretation_mode}, interpretation_language={interpretation_language}")
@@ -1297,7 +1335,7 @@ class OpenAIHandler(AsyncStreamHandler):
1297
  )
1298
 
1299
  print(f"Handler.copy() called - creating new handler with default settings")
1300
- return OpenAIHandler(web_search_enabled=False)
1301
 
1302
  async def search_web(self, query: str) -> str:
1303
  """Perform web search and return formatted results"""
@@ -1332,31 +1370,122 @@ class OpenAIHandler(AsyncStreamHandler):
1332
  )
1333
  await self.connection.response.create()
1334
 
1335
- def get_interpretation_instructions(self):
1336
- """Get instructions for interpretation mode"""
1337
- if not self.interpretation_mode or not self.interpretation_language:
1338
- return ""
1339
-
1340
- target_language_name = SUPPORTED_LANGUAGES.get(self.interpretation_language, self.interpretation_language)
1341
- target_code = self.interpretation_language
1342
 
1343
- return (
1344
- f"YOU ARE IN TRANSLATION MODE. YOUR ONLY FUNCTION IS TO TRANSLATE.\n\n"
1345
- f"RULES:\n"
1346
- f"1. TRANSLATE the input to {target_language_name} ({target_code})\n"
1347
- f"2. OUTPUT ONLY THE TRANSLATION\n"
1348
- f"3. DO NOT ANSWER QUESTIONS\n"
1349
- f"4. DO NOT PROVIDE INFORMATION\n"
1350
- f"5. DO NOT CONTINUE SPEAKING\n"
1351
- f"6. NEVER OUTPUT IN ANY OTHER LANGUAGE THAN {target_language_name}\n\n"
1352
- f"EXAMPLES:\n"
1353
- f"Input: '์•ˆ๋…•ํ•˜์„ธ์š”' โ†’ Output: 'Hello' (if English)\n"
1354
- f"Input: '๋‚ ์”จ๊ฐ€ ์–ด๋•Œ์š”?' โ†’ Output: 'How is the weather?' (if English)\n"
1355
- f"Input: '์ปคํ”ผ ํ•œ ์ž” ์ฃผ์„ธ์š”' โ†’ Output: 'One coffee please' (if English)\n\n"
1356
- f"YOU ARE NOT AN AI ASSISTANT. YOU ARE A TRANSLATOR.\n"
1357
- f"TRANSLATE AND STOP. NO ADDITIONAL WORDS.\n"
1358
- f"OUTPUT LANGUAGE: {target_language_name} ONLY."
1359
- )
1360
 
1361
  def get_translation_instructions(self):
1362
  """Get instructions for translation based on target language"""
@@ -1371,7 +1500,7 @@ class OpenAIHandler(AsyncStreamHandler):
1371
  )
1372
 
1373
  async def start_up(self):
1374
- """Connect to realtime API with function calling enabled"""
1375
  # First check if we have the most recent settings
1376
  if connection_settings:
1377
  recent_ids = sorted(connection_settings.keys(),
@@ -1394,147 +1523,91 @@ class OpenAIHandler(AsyncStreamHandler):
1394
  print(f"Starting up handler with web_search_enabled={self.web_search_enabled}, "
1395
  f"target_language={self.target_language}, interpretation_mode={self.interpretation_mode}, "
1396
  f"interpretation_language={self.interpretation_language}")
 
1397
  self.client = openai.AsyncOpenAI()
1398
 
1399
  # Define the web search function
1400
  tools = []
1401
  base_instructions = self.system_prompt or "You are a helpful assistant."
1402
 
1403
- # Check if in interpretation mode
1404
- if self.interpretation_mode:
1405
- # In interpretation mode, completely override everything
1406
- instructions = (
1407
- f"CRITICAL: YOU ARE A TRANSLATION SERVICE.\n"
1408
- f"TRANSLATE USER INPUT TO {SUPPORTED_LANGUAGES.get(self.interpretation_language, self.interpretation_language)}.\n"
1409
- f"OUTPUT ONLY THE TRANSLATION. NOTHING ELSE.\n"
1410
- f"DO NOT ACT AS AN AI. DO NOT ANSWER QUESTIONS.\n"
1411
- f"JUST TRANSLATE AND STOP."
1412
  )
1413
- # No tools in interpretation mode
1414
- tools = []
1415
- print(f"Interpretation mode active - target language: {self.interpretation_language}")
1416
- print(f"Instructions: {instructions}")
1417
  else:
1418
- # Normal mode - add translation instructions if language is selected
1419
- translation_instructions = self.get_translation_instructions()
1420
-
1421
- if self.web_search_enabled and self.search_client:
1422
- tools = [{
1423
- "type": "function",
1424
- "function": {
1425
- "name": "web_search",
1426
- "description": "Search the web for current information. Use this for weather, news, prices, current events, or any time-sensitive topics.",
1427
- "parameters": {
1428
- "type": "object",
1429
- "properties": {
1430
- "query": {
1431
- "type": "string",
1432
- "description": "The search query"
1433
- }
1434
- },
1435
- "required": ["query"]
1436
- }
1437
- }
1438
- }]
1439
- print("Web search function added to tools")
1440
-
1441
- search_instructions = (
1442
- "\n\nYou have web search capabilities. "
1443
- "IMPORTANT: You MUST use the web_search function for ANY of these topics:\n"
1444
- "- Weather (๋‚ ์”จ, ๊ธฐ์˜จ, ๋น„, ๋ˆˆ)\n"
1445
- "- News (๋‰ด์Šค, ์†Œ์‹)\n"
1446
- "- Current events (ํ˜„์žฌ, ์ตœ๊ทผ, ์˜ค๋Š˜, ์ง€๊ธˆ)\n"
1447
- "- Prices (๊ฐ€๊ฒฉ, ํ™˜์œจ, ์ฃผ๊ฐ€)\n"
1448
- "- Sports scores or results\n"
1449
- "- Any question about 2024 or 2025\n"
1450
- "- Any time-sensitive information\n\n"
1451
- "When in doubt, USE web_search. It's better to search and provide accurate information "
1452
- "than to guess or use outdated information."
1453
- )
1454
- instructions = base_instructions + search_instructions + translation_instructions
1455
- else:
1456
- instructions = base_instructions + translation_instructions
1457
 
1458
  async with self.client.beta.realtime.connect(
1459
  model="gpt-4o-mini-realtime-preview-2024-12-17"
1460
  ) as conn:
1461
  # Update session with tools
1462
  session_update = {
1463
- "turn_detection": {
1464
- "type": "server_vad",
1465
- "threshold": 0.5,
1466
- "prefix_padding_ms": 300,
1467
- "silence_duration_ms": 100 if self.interpretation_mode else 700 # Even shorter silence
1468
- },
1469
  "instructions": instructions,
1470
  "tools": tools,
1471
- "tool_choice": "none" if self.interpretation_mode else ("auto" if tools else "none"),
1472
- "temperature": 0.0 if self.interpretation_mode else 0.7,
1473
- "max_response_output_tokens": 50 if self.interpretation_mode else 4096 # Very short responses
1474
  }
1475
 
1476
- # Add voice setting based on interpretation or translation language
1477
- voice_language = self.interpretation_language if self.interpretation_mode else self.target_language
1478
-
1479
- # For interpretation mode, ensure proper settings
1480
- if self.interpretation_mode and self.interpretation_language:
1481
- # Force simple translation behavior
1482
- session_update["voice"] = "alloy"
1483
- session_update["modalities"] = ["audio", "text"] # Changed order
1484
-
1485
- # Create a very specific prompt for the target language
1486
- target_lang_name = SUPPORTED_LANGUAGES.get(self.interpretation_language, self.interpretation_language)
1487
-
1488
- # Use very explicit language instructions
1489
- if self.interpretation_language == "en":
1490
- lang_specific = "Respond in English only. ์˜์–ด๋กœ๋งŒ ๋‹ตํ•˜์„ธ์š”."
1491
- elif self.interpretation_language == "es":
1492
- lang_specific = "Respond in Spanish only. Solo responde en espaรฑol."
1493
- elif self.interpretation_language == "fr":
1494
- lang_specific = "Respond in French only. Rรฉpondez uniquement en franรงais."
1495
- elif self.interpretation_language == "de":
1496
- lang_specific = "Respond in German only. Antworten Sie nur auf Deutsch."
1497
- elif self.interpretation_language == "ja":
1498
- lang_specific = "Respond in Japanese only. ๆ—ฅๆœฌ่ชžใงใฎใฟ็ญ”ใˆใฆใใ ใ•ใ„ใ€‚"
1499
- elif self.interpretation_language == "zh":
1500
- lang_specific = "Respond in Chinese only. ๅช็”จไธญๆ–‡ๅ›ž็ญ”ใ€‚"
1501
- else:
1502
- lang_specific = f"Respond in {target_lang_name} only."
1503
-
1504
- # Override instructions with ultra-specific directive
1505
- session_update["instructions"] = (
1506
- f"TRANSLATE TO {target_lang_name.upper()}. "
1507
- f"{lang_specific} "
1508
- f"You are a translation machine. "
1509
- f"Input โ†’ Translation. Nothing else. "
1510
- f"Do not chat. Do not explain. Just translate. "
1511
- f"Maximum 20 words per response."
1512
- )
1513
-
1514
- # Additional session parameters for interpretation mode
1515
- session_update["input_audio_transcription"] = {
1516
- "model": "whisper-1"
1517
  }
1518
-
1519
- print(f"[INTERPRETATION MODE] Target: {self.interpretation_language} ({target_lang_name})")
1520
- print(f"[INTERPRETATION MODE] Instructions: {session_update['instructions']}")
1521
- print(f"[INTERPRETATION MODE] Session config: {session_update}")
1522
- elif voice_language:
1523
- # Normal translation mode
1524
- session_update["voice"] = "alloy"
1525
- print(f"Voice set to: alloy for language: {voice_language}")
1526
 
1527
  await conn.session.update(session=session_update)
1528
  self.connection = conn
1529
- print(f"Connected with tools: {len(tools)} functions, voice: {session_update.get('voice', 'default')}, "
1530
- f"interpretation_mode: {self.interpretation_mode}, language: {self.interpretation_language if self.interpretation_mode else self.target_language}")
1531
 
1532
  async for event in self.connection:
1533
- # Debug logging for interpretation mode
1534
- if self.interpretation_mode and event.type == "response.audio_transcript.done":
1535
- print(f"[INTERPRETATION] Transcript: {event.transcript[:100]}...")
1536
- print(f"[INTERPRETATION] Expected language: {self.interpretation_language}")
1537
-
1538
  # Debug logging for function calls
1539
  if event.type.startswith("response.function_call"):
1540
  print(f"Function event: {event.type}")
@@ -1542,11 +1615,7 @@ class OpenAIHandler(AsyncStreamHandler):
1542
  if event.type == "response.audio_transcript.done":
1543
  output_data = {
1544
  "event": event,
1545
- "language": SUPPORTED_LANGUAGES.get(
1546
- self.interpretation_language if self.interpretation_mode else self.target_language,
1547
- ""
1548
- ) if (self.interpretation_language or self.target_language) else "",
1549
- "mode": "interpretation" if self.interpretation_mode else "normal"
1550
  }
1551
  await self.output_queue.put(AdditionalOutputs(output_data))
1552
 
@@ -1607,18 +1676,46 @@ class OpenAIHandler(AsyncStreamHandler):
1607
  self.current_call_id = None
1608
 
1609
  async def receive(self, frame: tuple[int, np.ndarray]) -> None:
1610
- if not self.connection:
1611
- return
1612
- try:
1613
  _, array = frame
1614
  array = array.squeeze()
1615
- audio_message = base64.b64encode(array.tobytes()).decode("utf-8")
1616
- await self.connection.input_audio_buffer.append(audio=audio_message)
1617
- except Exception as e:
1618
- print(f"Error in receive: {e}")
1619
- # Connection might be closed, ignore the error
1620
 
1621
  async def emit(self) -> tuple[int, np.ndarray] | AdditionalOutputs | None:
1622
  item = await wait_for_item(self.output_queue)
1623
 
1624
  # Check if it's a dict with text message
@@ -1629,9 +1726,16 @@ class OpenAIHandler(AsyncStreamHandler):
1629
  return item
1630
 
1631
  async def shutdown(self) -> None:
1632
- if self.connection:
1633
- await self.connection.close()
1634
- self.connection = None
1635
 
1636
 
1637
  # Create initial handler instance
 
20
  import httpx
21
  from typing import Optional, List, Dict
22
  import gradio as gr
23
+ import io
24
+ from scipy import signal
25
+ import wave
26
 
27
  load_dotenv()
28
 
 
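The three new imports support the interpretation path added further down in this diff: io and wave package the buffered microphone audio into an in-memory WAV for the Whisper call in process_interpretation, and scipy.signal is presumably there for resampling (a sketch of that step follows the process_interpretation hunk below). A minimal sketch of the WAV packaging, assuming the buffered frames are mono 16-bit PCM at the handler's SAMPLE_RATE; the helper name pcm_to_wav is illustrative, not part of the commit:

    import io
    import wave
    import numpy as np

    def pcm_to_wav(pcm: np.ndarray, rate: int) -> io.BytesIO:
        """Wrap mono int16 PCM samples in an in-memory WAV container."""
        buf = io.BytesIO()
        with wave.open(buf, "wb") as wav:
            wav.setnchannels(1)   # mono
            wav.setsampwidth(2)   # 16-bit samples
            wav.setframerate(rate)
            wav.writeframes(pcm.astype(np.int16).tobytes())
        buf.seek(0)
        buf.name = "audio.wav"    # filename hint the upload API uses to infer the format
        return buf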
318
  padding: 10px;
319
  margin-bottom: 10px;
320
  }
321
+ .message.assistant.interpretation {
322
+ background: linear-gradient(135deg, #1a5a3e, #2e7d32);
323
+ font-style: italic;
324
+ }
325
+ .interpretation-arrow {
326
+ color: #4caf50;
327
+ font-weight: bold;
328
+ margin: 0 10px;
329
+ }
330
  .controls {
331
  text-align: center;
332
  margin-top: auto;
 
630
  </div>
631
  </div>
632
  <div class="interpretation-info" id="interpretation-info" style="display: none;">
633
+ <strong>통역 모드 안내:</strong><br>
634
+ • 음성으로 말하면 선택한 언어로 자동 통역됩니다<br>
635
+ • Whisper + GPT-4o-mini + TTS를 사용합니다<br>
636
+ • 말을 마치고 잠시 기다리면 통역이 시작됩니다
637
  </div>
638
  <div class="text-input-section">
639
  <label for="system-prompt" class="setting-label">시스템 프롬프트:</label>
 
1064
  if (selectedLanguage && eventJson.language) {
1065
  content += ` <span class="language-info">[${eventJson.language}]</span>`;
1066
  } else if (interpretationMode && eventJson.language) {
1067
+ // In interpretation mode, show the translation process
1068
+ if (content.includes('→')) {
1069
+ // Format: "Korean text → English text"
1070
+ const parts = content.split('→');
1071
+ if (parts.length === 2) {
1072
+ content = `<span style="color: #999;">${parts[0].trim()}</span>` +
1073
+ `<span class="interpretation-arrow">→</span>` +
1074
+ `<strong>${parts[1].trim()}</strong>`;
1075
+ }
1076
+ }
1077
  content += ` <span class="language-info">[통역: ${eventJson.language}]</span>`;
1078
  }
1079
  addMessage("assistant", content);
 
1094
  function addMessage(role, content) {
1095
  const messageDiv = document.createElement('div');
1096
  messageDiv.classList.add('message', role);
1097
+
1098
+ // Check if it's an interpretation message
1099
+ if (interpretationMode && role === 'assistant' && content.includes('→')) {
1100
+ messageDiv.classList.add('interpretation');
1101
+ }
1102
+
1103
  if (content.includes('<span')) {
1104
  messageDiv.innerHTML = content;
1105
  } else {
 
1304
  self.system_prompt = system_prompt
1305
  self.interpretation_mode = interpretation_mode
1306
  self.interpretation_language = interpretation_language
1307
+
1308
+ # For interpretation mode
1309
+ self.audio_buffer = []
1310
+ self.is_recording = False
1311
+ self.silence_frames = 0
1312
+ self.silence_threshold = 30 # Number of silent frames before stopping
1313
+
1314
  print(f"Handler created with web_search_enabled={web_search_enabled}, "
1315
  f"target_language={target_language}, webrtc_id={webrtc_id}, "
1316
  f"interpretation_mode={interpretation_mode}, interpretation_language={interpretation_language}")
 
1335
  )
1336
 
1337
  print(f"Handler.copy() called - creating new handler with default settings")
1338
+ return OpenAIHandler(web_search_enabled=False, interpretation_mode=False)
1339
 
1340
  async def search_web(self, query: str) -> str:
1341
  """Perform web search and return formatted results"""
 
1370
  )
1371
  await self.connection.response.create()
1372
 
1373
+ async def process_interpretation(self):
1374
+ """Process audio buffer for interpretation"""
1375
+ if not self.audio_buffer or not self.interpretation_language:
1376
+ return
 
 
 
1377
 
1378
+ try:
1379
+ print(f"[INTERPRETATION] Processing audio buffer with {len(self.audio_buffer)} frames")
1380
+
1381
+ # Convert audio buffer to WAV format
1382
+ audio_data = np.concatenate(self.audio_buffer)
1383
+
1384
+ # Create WAV file in memory
1385
+ wav_buffer = io.BytesIO()
1386
+ with wave.open(wav_buffer, 'wb') as wav_file:
1387
+ wav_file.setnchannels(1) # Mono
1388
+ wav_file.setsampwidth(2) # 16-bit
1389
+ wav_file.setframerate(SAMPLE_RATE)
1390
+ wav_file.writeframes(audio_data.tobytes())
1391
+
1392
+ wav_buffer.seek(0)
1393
+ wav_buffer.name = "audio.wav"
1394
+
1395
+ # 1. Transcribe with Whisper
1396
+ print("[INTERPRETATION] Transcribing with Whisper...")
1397
+ transcript = await self.client.audio.transcriptions.create(
1398
+ model="whisper-1",
1399
+ file=wav_buffer,
1400
+ language="ko" # Assuming Korean input
1401
+ )
1402
+
1403
+ user_text = transcript.text.strip()
1404
+ print(f"[INTERPRETATION] Transcribed: {user_text}")
1405
+
1406
+ if not user_text:
1407
+ return
1408
+
1409
+ # 2. Translate with GPT-4o-mini
1410
+ target_lang_name = SUPPORTED_LANGUAGES.get(self.interpretation_language, self.interpretation_language)
1411
+
1412
+ print(f"[INTERPRETATION] Translating to {target_lang_name}...")
1413
+ translation_response = await self.client.chat.completions.create(
1414
+ model="gpt-4o-mini",
1415
+ messages=[
1416
+ {
1417
+ "role": "system",
1418
+ "content": f"You are a translator. Translate the following text to {target_lang_name}. "
1419
+ f"Provide only the translation, nothing else."
1420
+ },
1421
+ {
1422
+ "role": "user",
1423
+ "content": user_text
1424
+ }
1425
+ ],
1426
+ temperature=0.3,
1427
+ max_tokens=200
1428
+ )
1429
+
1430
+ translated_text = translation_response.choices[0].message.content.strip()
1431
+ print(f"[INTERPRETATION] Translated: {translated_text}")
1432
+
1433
+ # 3. Generate speech with TTS
1434
+ print("[INTERPRETATION] Generating speech...")
1435
+ tts_response = await self.client.audio.speech.create(
1436
+ model="tts-1",
1437
+ voice="alloy",
1438
+ input=translated_text,
1439
+ response_format="pcm", # PCM format for direct playback
1440
+ speed=1.0
1441
+ )
1442
+
1443
+ # Convert response to bytes
1444
+ audio_bytes = b""
1445
+ async for chunk in tts_response.iter_bytes(1024):
1446
+ audio_bytes += chunk
1447
+
1448
+ # Convert PCM to numpy array
1449
+ audio_array = np.frombuffer(audio_bytes, dtype=np.int16)
1450
+
1451
+ # Resample from 24kHz (TTS output) to our sample rate if needed
1452
+ if len(audio_array) > 0:
1453
+ # Split audio into chunks and send
1454
+ chunk_size = 480 # Match our frame size
1455
+ for i in range(0, len(audio_array), chunk_size):
1456
+ chunk = audio_array[i:i + chunk_size]
1457
+ if len(chunk) < chunk_size:
1458
+ # Pad the last chunk if necessary
1459
+ chunk = np.pad(chunk, (0, chunk_size - len(chunk)), 'constant')
1460
+
1461
+ await self.output_queue.put((SAMPLE_RATE, chunk.reshape(1, -1)))
1462
+
1463
+ # Send transcript event
1464
+ output_data = {
1465
+ "event": type('Event', (), {
1466
+ 'transcript': f"{user_text} โ†’ {translated_text}"
1467
+ })(),
1468
+ "language": target_lang_name,
1469
+ "mode": "interpretation"
1470
+ }
1471
+ await self.output_queue.put(AdditionalOutputs(output_data))
1472
+
1473
+ except Exception as e:
1474
+ print(f"[INTERPRETATION] Error: {e}")
1475
+ # Send error message to client
1476
+ error_data = {
1477
+ "event": type('Event', (), {
1478
+ 'transcript': f"ํ†ต์—ญ ์˜ค๋ฅ˜: {str(e)}"
1479
+ })(),
1480
+ "language": "",
1481
+ "mode": "error"
1482
+ }
1483
+ await self.output_queue.put(AdditionalOutputs(error_data))
1484
+ finally:
1485
+ # Clear the audio buffer
1486
+ self.audio_buffer = []
1487
+ self.is_recording = False
1488
+ self.silence_frames = 0
1489
 
1490
  def get_translation_instructions(self):
1491
  """Get instructions for translation based on target language"""
 
1500
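A note on sample rates for the hunk above: the comment says the TTS PCM comes back at 24 kHz and may need resampling to the handler's SAMPLE_RATE, but the code as shown re-queues the samples unchanged; the scipy.signal import added at the top of the diff would be the natural tool for that step. A hedged sketch of what such a conversion could look like — TTS_RATE and the helper name are assumptions, not part of the commit:

    from scipy import signal
    import numpy as np

    TTS_RATE = 24000  # tts-1 "pcm" output is documented as 24 kHz, 16-bit, mono

    def resample_pcm(audio: np.ndarray, src_rate: int, dst_rate: int) -> np.ndarray:
        """Resample int16 PCM with a polyphase filter and return int16 again."""
        if src_rate == dst_rate:
            return audio
        out = signal.resample_poly(audio.astype(np.float32), dst_rate, src_rate)
        return np.clip(out, -32768, 32767).astype(np.int16)

    # e.g. just before chunking into 480-sample frames:
    # audio_array = resample_pcm(audio_array, TTS_RATE, SAMPLE_RATE)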
  )
1501
 
1502
  async def start_up(self):
1503
+ """Connect to realtime API or setup interpretation mode"""
1504
  # First check if we have the most recent settings
1505
  if connection_settings:
1506
  recent_ids = sorted(connection_settings.keys(),
 
1523
  print(f"Starting up handler with web_search_enabled={self.web_search_enabled}, "
1524
  f"target_language={self.target_language}, interpretation_mode={self.interpretation_mode}, "
1525
  f"interpretation_language={self.interpretation_language}")
1526
+
1527
  self.client = openai.AsyncOpenAI()
1528
 
1529
+ # If in interpretation mode, don't connect to Realtime API
1530
+ if self.interpretation_mode:
1531
+ print(f"[INTERPRETATION MODE] Active - using Whisper + GPT-4o-mini + TTS")
1532
+ print(f"[INTERPRETATION MODE] Target language: {self.interpretation_language}")
1533
+ # Just keep the handler ready to process audio
1534
+ return
1535
+
1536
+ # Normal mode - connect to Realtime API
1537
  # Define the web search function
1538
  tools = []
1539
  base_instructions = self.system_prompt or "You are a helpful assistant."
1540
 
1541
+ # Add translation instructions if language is selected
1542
+ translation_instructions = self.get_translation_instructions()
1543
+
1544
+ if self.web_search_enabled and self.search_client:
1545
+ tools = [{
1546
+ "type": "function",
1547
+ "function": {
1548
+ "name": "web_search",
1549
+ "description": "Search the web for current information. Use this for weather, news, prices, current events, or any time-sensitive topics.",
1550
+ "parameters": {
1551
+ "type": "object",
1552
+ "properties": {
1553
+ "query": {
1554
+ "type": "string",
1555
+ "description": "The search query"
1556
+ }
1557
+ },
1558
+ "required": ["query"]
1559
+ }
1560
+ }
1561
+ }]
1562
+ print("Web search function added to tools")
1563
+
1564
+ search_instructions = (
1565
+ "\n\nYou have web search capabilities. "
1566
+ "IMPORTANT: You MUST use the web_search function for ANY of these topics:\n"
1567
+ "- Weather (๋‚ ์”จ, ๊ธฐ์˜จ, ๋น„, ๋ˆˆ)\n"
1568
+ "- News (๋‰ด์Šค, ์†Œ์‹)\n"
1569
+ "- Current events (ํ˜„์žฌ, ์ตœ๊ทผ, ์˜ค๋Š˜, ์ง€๊ธˆ)\n"
1570
+ "- Prices (๊ฐ€๊ฒฉ, ํ™˜์œจ, ์ฃผ๊ฐ€)\n"
1571
+ "- Sports scores or results\n"
1572
+ "- Any question about 2024 or 2025\n"
1573
+ "- Any time-sensitive information\n\n"
1574
+ "When in doubt, USE web_search. It's better to search and provide accurate information "
1575
+ "than to guess or use outdated information."
1576
  )
1577
+ instructions = base_instructions + search_instructions + translation_instructions
1578
  else:
1579
+ instructions = base_instructions + translation_instructions
1580
 
1581
  async with self.client.beta.realtime.connect(
1582
  model="gpt-4o-mini-realtime-preview-2024-12-17"
1583
  ) as conn:
1584
  # Update session with tools
1585
  session_update = {
1586
+ "turn_detection": {"type": "server_vad"},
1587
  "instructions": instructions,
1588
  "tools": tools,
1589
+ "tool_choice": "auto" if tools else "none"
1590
  }
1591
 
1592
+ # Add voice setting if target language is selected
1593
+ if self.target_language:
1594
+ # Map languages to appropriate voices
1595
+ voice_map = {
1596
+ "en": "alloy",
1597
+ "es": "nova",
1598
+ "fr": "nova",
1599
+ "de": "nova",
1600
+ "ja": "nova",
1601
+ "zh": "nova",
1602
+ # Default to alloy for other languages
1603
  }
1604
+ session_update["voice"] = voice_map.get(self.target_language, "alloy")
1605
 
1606
  await conn.session.update(session=session_update)
1607
  self.connection = conn
1608
+ print(f"Connected with tools: {len(tools)} functions, voice: {session_update.get('voice', 'default')}")
 
1609
 
1610
  async for event in self.connection:
1611
  # Debug logging for function calls
1612
  if event.type.startswith("response.function_call"):
1613
  print(f"Function event: {event.type}")
 
1615
  if event.type == "response.audio_transcript.done":
1616
  output_data = {
1617
  "event": event,
1618
+ "language": SUPPORTED_LANGUAGES.get(self.target_language, "") if self.target_language else ""
1619
  }
1620
  await self.output_queue.put(AdditionalOutputs(output_data))
1621
 
 
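For a concrete sense of what the normal (non-interpretation) branch above ends up sending to conn.session.update, this is roughly the payload it assembles when web search is disabled and the target language is, say, "ja" — an illustrative example assembled from the hunk, with the instructions string depending on the configured system prompt:

    session_update = {
        "turn_detection": {"type": "server_vad"},
        "instructions": base_instructions + translation_instructions,
        "tools": [],                 # no web_search tool registered
        "tool_choice": "none",       # "auto" only when tools are present
        "voice": "nova",             # voice_map["ja"]; other languages fall back to "alloy"
    }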
1676
  self.current_call_id = None
1677
 
1678
  async def receive(self, frame: tuple[int, np.ndarray]) -> None:
1679
+ if self.interpretation_mode:
1680
+ # In interpretation mode, buffer audio and process with Whisper
 
1681
  _, array = frame
1682
  array = array.squeeze()
1683
+
1684
+ # Simple voice activity detection
1685
+ audio_level = np.abs(array).mean()
1686
+
1687
+ if audio_level > 300: # Lowered threshold for better detection
1688
+ self.is_recording = True
1689
+ self.silence_frames = 0
1690
+ self.audio_buffer.append(array)
1691
+ if len(self.audio_buffer) % 10 == 0: # Log every 10 frames
1692
+ print(f"[INTERPRETATION] Recording... buffer size: {len(self.audio_buffer)}, level: {audio_level:.1f}")
1693
+ elif self.is_recording:
1694
+ self.silence_frames += 1
1695
+ self.audio_buffer.append(array)
1696
+
1697
+ # If we've had enough silence, process the audio
1698
+ if self.silence_frames > self.silence_threshold:
1699
+ print(f"[INTERPRETATION] Silence detected, processing {len(self.audio_buffer)} frames")
1700
+ await self.process_interpretation()
1701
+ else:
1702
+ # Normal mode - use Realtime API
1703
+ if not self.connection:
1704
+ return
1705
+ try:
1706
+ _, array = frame
1707
+ array = array.squeeze()
1708
+ audio_message = base64.b64encode(array.tobytes()).decode("utf-8")
1709
+ await self.connection.input_audio_buffer.append(audio=audio_message)
1710
+ except Exception as e:
1711
+ print(f"Error in receive: {e}")
1712
+ # Connection might be closed, ignore the error
1713
 
1714
  async def emit(self) -> tuple[int, np.ndarray] | AdditionalOutputs | None:
1715
+ # In interpretation mode, check if we need to process buffered audio
1716
+ if self.interpretation_mode and self.is_recording and self.silence_frames > self.silence_threshold:
1717
+ await self.process_interpretation()
1718
+
1719
  item = await wait_for_item(self.output_queue)
1720
 
1721
  # Check if it's a dict with text message
 
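The voice-activity logic in the receive() hunk above is frame-count based: recording starts once a frame's mean absolute amplitude exceeds 300 on the int16 scale, and interpretation kicks in after silence_threshold = 30 consecutive quiet frames. Assuming 480-sample frames at 24 kHz (the same frame size used when re-queuing TTS audio; SAMPLE_RATE itself is defined elsewhere in app.py), 30 frames is roughly 0.6 s of silence. A small illustrative helper for expressing that cut-off in milliseconds rather than frames — the names and defaults are assumptions, not part of the commit:

    FRAME_SAMPLES = 480      # frame size used elsewhere in the handler
    ASSUMED_RATE = 24000     # stand-in for the app's SAMPLE_RATE constant

    def frames_for_silence(ms: float, rate: int = ASSUMED_RATE,
                           frame_samples: int = FRAME_SAMPLES) -> int:
        """How many consecutive silent frames correspond to `ms` of audio."""
        return max(1, round(ms * rate / 1000 / frame_samples))

    # frames_for_silence(600) -> 30, matching silence_threshold in __init__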
1726
  return item
1727
 
1728
  async def shutdown(self) -> None:
1729
+ if self.interpretation_mode:
1730
+ # Clean up interpretation mode
1731
+ self.audio_buffer = []
1732
+ self.is_recording = False
1733
+ print("[INTERPRETATION MODE] Shutdown complete")
1734
+ else:
1735
+ # Normal mode - close Realtime API connection
1736
+ if self.connection:
1737
+ await self.connection.close()
1738
+ self.connection = None
1739
 
1740
 
1741
  # Create initial handler instance