Update app.py
app.py CHANGED
@@ -1,5 +1,5 @@
+import asyncio
 import base64
-import re
 import json
 from pathlib import Path
 import os
@@ -1341,75 +1341,38 @@ async def process_text_chat(message: str, web_search_enabled: bool, target_langu
 
 
 class OpenAIHandler(AsyncStreamHandler):
-    ...
-    # ⚡ New helper function
-    async def _translate_text(self, source_ko: str) -> str:
-        """
-        Dedicated Korean → self.interpretation_language translation.
-        - Uses a function call so that only the "pure translation" is returned.
-        """
-        tgt_code = self.interpretation_language
-        tgt_name = SUPPORTED_LANGUAGES.get(tgt_code, tgt_code)
-
-        translate_tool = [{
-            "type": "function",
-            "function": {
-                "name": "return_translation",
-                "description": f"Return ONLY the {tgt_name} translation of the given Korean sentence.",
-                "parameters": {
-                    "type": "object",
-                    "properties": {
-                        "translation": {"type": "string"}
-                    },
-                    "required": ["translation"]
-                }
-            }
-        }]
-
-        resp = await self.client.chat.completions.create(
-            model="gpt-4o-mini",
-            tools=translate_tool,
-            tool_choice="auto",
-            messages=[
-                {"role": "system",
-                 "content": f"You are a Korean→{tgt_name} translator. Respond via function call ONLY."},
-                {"role": "user", "content": source_ko}
-            ],
-            temperature=0.0,
-        )
-        ...
+    def __init__(self, web_search_enabled: bool = False, target_language: str = "",
+                 system_prompt: str = "", webrtc_id: str = None,
+                 interpretation_mode: bool = False, interpretation_language: str = "") -> None:
+        super().__init__(
+            expected_layout="mono",
+            output_sample_rate=SAMPLE_RATE,
+            output_frame_size=480,
+            input_sample_rate=SAMPLE_RATE,
+        )
+        self.connection = None
+        self.output_queue = asyncio.Queue()
+        self.search_client = search_client
+        self.function_call_in_progress = False
+        self.current_function_args = ""
+        self.current_call_id = None
+        self.webrtc_id = webrtc_id
+        self.web_search_enabled = web_search_enabled
+        self.target_language = target_language
+        self.system_prompt = system_prompt
+        self.interpretation_mode = interpretation_mode
+        self.interpretation_language = interpretation_language
+
+        # For interpretation mode
+        self.audio_buffer = []
+        self.is_recording = False
+        self.silence_frames = 0
+        self.silence_threshold = 20  # Reduced for faster response (20 frames = ~0.4 seconds)
+        self.min_audio_length = 10  # Minimum frames to consider as speech
+
+        print(f"Handler created with web_search_enabled={web_search_enabled}, "
+              f"target_language={target_language}, webrtc_id={webrtc_id}, "
+              f"interpretation_mode={interpretation_mode}, interpretation_language={interpretation_language}")
 
     def copy(self):
        # Get the most recent settings
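
A note on the removed helper: the `tools`/`tool_choice` setup plus the "Respond via function call ONLY" system prompt steer gpt-4o-mini into answering through `return_translation`, so the translation arrives as structured JSON rather than free text. The code that unpacked the response sits in the lines collapsed out of this hunk; as a minimal sketch, assuming the standard OpenAI Python SDK v1 response shape (`extract_translation` is a hypothetical name, not from app.py):

import json

def extract_translation(resp) -> str:
    # The forced function call puts the translated sentence in the
    # tool call's JSON arguments instead of the message content.
    call = resp.choices[0].message.tool_calls[0]
    return json.loads(call.function.arguments)["translation"]
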
@@ -1711,20 +1674,9 @@ Now translate the Korean text to {target_lang_name}. Output ONLY the translation
         if self.interpretation_mode:
             print(f"[INTERPRETATION MODE] Active - using Whisper + GPT-4o-mini + TTS")
             print(f"[INTERPRETATION MODE] Target language: {self.interpretation_language}")
-
-            # The receive() and emit() methods will handle audio processing
-            while True:
-                try:
-                    await asyncio.sleep(1.0)
-                    # Just keep alive
-                    if not self.interpretation_mode:
-                        print("[INTERPRETATION MODE] Mode disabled, exiting loop")
-                        break
-                except asyncio.CancelledError:
-                    print("[INTERPRETATION MODE] Cancelled")
-                    break
+            # Just keep the handler ready to process audio
+            # Don't use an infinite loop here - the handler will be called by the framework
+            self.client = openai.AsyncOpenAI()
             return
 
         # Normal mode - connect to Realtime API
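
The '+' side replaces the keep-alive polling loop with a single `openai.AsyncOpenAI()` client; `receive()` and `emit()` then drive the "Whisper + GPT-4o-mini + TTS" round trip the log lines mention. A hedged sketch of that pipeline using stock OpenAI SDK endpoints (`translate_chunk`, the WAV buffering, and the `alloy` voice are illustrative assumptions, not code from app.py):

import io

async def translate_chunk(client, wav_bytes: bytes, target: str) -> bytes:
    buf = io.BytesIO(wav_bytes)
    buf.name = "speech.wav"  # the SDK infers the audio format from the name
    # 1) Speech -> Korean text
    text = (await client.audio.transcriptions.create(
        model="whisper-1", file=buf)).text
    # 2) Korean text -> target-language text
    resp = await client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "system",
                   "content": f"Translate the user's Korean into {target}."},
                  {"role": "user", "content": text}],
        temperature=0.0)
    translated = resp.choices[0].message.content
    # 3) Target-language text -> speech
    speech = await client.audio.speech.create(
        model="tts-1", voice="alloy", input=translated)
    return speech.content
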
@@ -1993,12 +1945,6 @@ RULES:
             _, array = frame
             array = array.squeeze()
 
-            # Make sure we're not connected to Realtime API
-            if self.connection:
-                print("[INTERPRETATION] WARNING: Connection exists in interpretation mode! Closing it.")
-                await self.connection.close()
-                self.connection = None
-
             # Simple voice activity detection
             audio_level = np.abs(array).mean()
 
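
On the voice activity detection kept in `receive()`: with `output_frame_size=480` at SAMPLE_RATE the handler sees roughly 20 ms frames, so `silence_threshold = 20` closes an utterance after about 0.4 s of quiet and `min_audio_length = 10` drops blips shorter than ~0.2 s. A sketch of the frame-counting segmentation those fields imply (the amplitude threshold and the return protocol are assumptions, not code from app.py):

import numpy as np

AMPLITUDE_THRESHOLD = 200  # mean |int16| level counted as speech; tune per mic

class VadState:
    def __init__(self):
        self.audio_buffer, self.is_recording, self.silence_frames = [], False, 0
        self.silence_threshold = 20   # ~0.4 s of 20 ms frames
        self.min_audio_length = 10    # ~0.2 s minimum utterance

    def step(self, frame: np.ndarray) -> np.ndarray | None:
        """Feed one frame; return the full utterance when it ends, else None."""
        if np.abs(frame).mean() > AMPLITUDE_THRESHOLD:
            self.is_recording, self.silence_frames = True, 0
            self.audio_buffer.append(frame)
        elif self.is_recording:
            self.silence_frames += 1
            self.audio_buffer.append(frame)
            if (self.silence_frames >= self.silence_threshold
                    and len(self.audio_buffer) >= self.min_audio_length):
                utterance = np.concatenate(self.audio_buffer)
                self.audio_buffer, self.is_recording, self.silence_frames = [], False, 0
                return utterance
        return None
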
@@ -2020,7 +1966,6 @@ RULES:
         else:
             # Normal mode - use Realtime API
             if not self.connection:
-                print("[NORMAL MODE] No connection in receive()")
                 return
             try:
                 _, array = frame
@@ -2034,23 +1979,9 @@ RULES:
     async def emit(self) -> tuple[int, np.ndarray] | AdditionalOutputs | None:
         # In interpretation mode, we need to keep checking for audio
         if self.interpretation_mode:
-            # Make absolutely sure no Realtime API connection
-            if self.connection:
-                print("[INTERPRETATION] WARNING: Connection found in emit()! Closing.")
-                try:
-                    await self.connection.close()
-                except:
-                    pass
-                self.connection = None
-
             # Use a timeout to prevent blocking forever
             try:
                 item = await asyncio.wait_for(wait_for_item(self.output_queue), timeout=0.1)
-                # Log what we're emitting
-                if isinstance(item, tuple):
-                    print(f"[INTERPRETATION] Emitting audio chunk")
-                elif isinstance(item, AdditionalOutputs):
-                    print(f"[INTERPRETATION] Emitting transcript/event")
                 return item
             except asyncio.TimeoutError:
                 return None
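
`emit()` polls `output_queue` with a 100 ms `asyncio.wait_for` so the framework's audio loop never blocks on an idle queue; returning `None` means "nothing yet, ask again". The same contract shown with a plain `asyncio.Queue` in place of fastrtc's `wait_for_item` helper:

import asyncio

async def poll(queue: asyncio.Queue, timeout: float = 0.1):
    # A slow producer never stalls the caller: after `timeout` seconds
    # with an empty queue we hand back None instead of waiting forever.
    try:
        return await asyncio.wait_for(queue.get(), timeout)
    except asyncio.TimeoutError:
        return None
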