Update app.py

app.py CHANGED
@@ -861,8 +861,8 @@ HTML_CONTENT = """<!DOCTYPE html>
            console.log('Selected language:', selectedLanguage);
        });

-        // Interpretation mode toggle
-        interpretationToggle.addEventListener('click', () => {
+        // Interpretation mode toggle - revised version
+        interpretationToggle.addEventListener('click', async () => {
            if (!interpretationMode) {
                // Turning ON interpretation mode
                interpretationLanguagesContainer.style.display = 'block';
@@ -891,15 +891,22 @@ HTML_CONTENT = """<!DOCTYPE html>
                textInput.placeholder = '통역 모드에서는 텍스트 입력이 지원되지 않습니다';
                sendButton.style.display = 'none';

-                console.log('Interpretation mode enabled with languages:', interpretationLanguages);
+                console.log('[FRONTEND] Interpretation mode enabled with languages:', interpretationLanguages);

-                // If connected, restart
+                // If connected, stop and restart with interpretation mode
                if (peerConnection && peerConnection.connectionState === 'connected') {
+                    console.log('[FRONTEND] Stopping current connection for interpretation mode');
                    showError('통역 모드 설정을 적용하기 위해 연결을 다시 시작합니다.');
+
+                    // Force stop the connection
                    stop();
-
-
-
+
+                    // Wait a bit longer to ensure cleanup
+                    await new Promise(resolve => setTimeout(resolve, 1000));
+
+                    // Start new connection with interpretation mode
+                    console.log('[FRONTEND] Starting new connection with interpretation mode');
+                    setupWebRTC();
                }
            } else {
                // Turning OFF interpretation mode
@@ -916,19 +923,26 @@ HTML_CONTENT = """<!DOCTYPE html>
                textInput.placeholder = '텍스트 메시지를 입력하세요...';
                sendButton.style.display = 'block';

-                console.log('Interpretation mode disabled');
+                console.log('[FRONTEND] Interpretation mode disabled');

                // If connected, restart to apply normal mode
                if (peerConnection && peerConnection.connectionState === 'connected') {
+                    console.log('[FRONTEND] Stopping current connection for normal mode');
                    showError('일반 모드로 전환하기 위해 연결을 다시 시작합니다.');
+
+                    // Force stop the connection
                    stop();
-
-
-
+
+                    // Wait a bit longer to ensure cleanup
+                    await new Promise(resolve => setTimeout(resolve, 1000));
+
+                    // Start new connection with normal mode
+                    console.log('[FRONTEND] Starting new connection with normal mode');
+                    setupWebRTC();
                }
            }

-            console.log('
+            console.log('[FRONTEND] Final interpretation mode:', interpretationMode);
        });

        // System prompt update
@@ -1284,16 +1298,33 @@ HTML_CONTENT = """<!DOCTYPE html>
        }

        function stop() {
+            console.log('[STOP] Stopping connection...');
+
+            // Cancel animation frame first
            if (animationFrame) {
                cancelAnimationFrame(animationFrame);
+                animationFrame = null;
            }
+
+            // Close audio context
            if (audioContext) {
                audioContext.close();
                audioContext = null;
                analyser = null;
                audioSource = null;
            }
+
+            // Close data channel
+            if (dataChannel) {
+                dataChannel.close();
+                dataChannel = null;
+            }
+
+            // Close peer connection
            if (peerConnection) {
+                console.log('[STOP] Current connection state:', peerConnection.connectionState);
+
+                // Stop all transceivers
                if (peerConnection.getTransceivers) {
                    peerConnection.getTransceivers().forEach(transceiver => {
                        if (transceiver.stop) {
@@ -1301,17 +1332,46 @@ HTML_CONTENT = """<!DOCTYPE html>
                        }
                    });
                }
+
+                // Stop all senders
                if (peerConnection.getSenders) {
                    peerConnection.getSenders().forEach(sender => {
-                        if (sender.track
+                        if (sender.track) {
+                            sender.track.stop();
+                        }
+                    });
+                }
+
+                // Stop all receivers
+                if (peerConnection.getReceivers) {
+                    peerConnection.getReceivers().forEach(receiver => {
+                        if (receiver.track) {
+                            receiver.track.stop();
+                        }
                    });
                }
-
+
+                // Close the connection
                peerConnection.close();
+
+                // Clear the reference
+                peerConnection = null;
+
+                console.log('[STOP] Connection closed');
            }
-
-
+
+            // Reset audio level
            audioLevel = 0;
+            isVoiceActive = false;
+
+            // Update UI
+            updateButtonState();
+
+            // Clear any existing webrtc_id
+            if (webrtc_id) {
+                console.log('[STOP] Clearing webrtc_id:', webrtc_id);
+                webrtc_id = null;
+            }
        }
        startButton.addEventListener('click', () => {
            console.log('clicked');
@@ -1534,9 +1594,8 @@ class OpenAIHandler(AsyncStreamHandler):
        self.silence_threshold = 20  # Reduced for faster response (20 frames = ~0.4 seconds)
        self.min_audio_length = 10  # Minimum frames to consider as speech

-        print(f"Handler created with
-        f"
-              f"interpretation_mode={interpretation_mode}, interpretation_languages={interpretation_languages}")
+        print(f"[INIT] Handler created with interpretation_mode={interpretation_mode}, "
+              f"interpretation_languages={interpretation_languages}")

    def copy(self):
        # Get the most recent settings
@@ -1548,6 +1607,12 @@ class OpenAIHandler(AsyncStreamHandler):
        if recent_ids:
            recent_id = recent_ids[0]
            settings = connection_settings[recent_id]
+
+            # Log the settings being copied
+            print(f"[COPY] Copying settings from {recent_id}:")
+            print(f"[COPY] interpretation_mode={settings.get('interpretation_mode', False)}")
+            print(f"[COPY] interpretation_languages={settings.get('interpretation_languages', [])}")
+
            return OpenAIHandler(
                web_search_enabled=settings.get('web_search_enabled', False),
                target_language=settings.get('target_language', ''),
@@ -1557,7 +1622,7 @@ class OpenAIHandler(AsyncStreamHandler):
                interpretation_languages=settings.get('interpretation_languages', [])
            )

-        print(f"
+        print(f"[COPY] No settings found, creating default handler")
        return OpenAIHandler(web_search_enabled=False, interpretation_mode=False)

    async def search_web(self, query: str) -> str:
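Note: connection_settings itself is declared outside the hunks in this commit. A minimal sketch of the assumed shape that copy() reads from; the helper name most_recent_settings is hypothetical, copy() inlines the same timestamp-based lookup:

# Assumed shape only; the actual declaration is not part of this diff.
connection_settings: dict[str, dict] = {}  # webrtc_id -> settings dict including a 'timestamp' key

def most_recent_settings() -> dict | None:
    """Mirror of the lookup in copy(): the entry stored most recently wins."""
    if not connection_settings:
        return None
    recent_ids = sorted(connection_settings,
                        key=lambda k: connection_settings[k]['timestamp'],
                        reverse=True)
    return connection_settings[recent_ids[0]]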
@@ -1772,52 +1837,47 @@ Direct translation to {target_lang_name}:"""
    async def start_up(self):
        """Connect to realtime API or setup interpretation mode"""
        # First check if we have the most recent settings
-        if connection_settings:
-
-
-                reverse=True)
-            if recent_ids:
-                recent_id = recent_ids[0]
-                settings = connection_settings[recent_id]
+        if connection_settings and self.webrtc_id:
+            if self.webrtc_id in connection_settings:
+                settings = connection_settings[self.webrtc_id]
                self.web_search_enabled = settings.get('web_search_enabled', False)
                self.target_language = settings.get('target_language', '')
                self.system_prompt = settings.get('system_prompt', '')
                self.interpretation_mode = settings.get('interpretation_mode', False)
                self.interpretation_languages = settings.get('interpretation_languages', [])
-
-        print(f"
-
-
-        print(f"Handler interpretation settings: mode={self.interpretation_mode}, languages={self.interpretation_languages}")
-
-        print(f"Starting
-              f"target_language={self.target_language}, interpretation_mode={self.interpretation_mode}, "
-              f"interpretation_languages={self.interpretation_languages}")
+
+                print(f"[START_UP] Updated settings from storage for {self.webrtc_id}")
+                print(f"[START_UP] interpretation_mode={self.interpretation_mode}")
+                print(f"[START_UP] interpretation_languages={self.interpretation_languages}")

+        print(f"[START_UP] Starting with interpretation_mode={self.interpretation_mode}")

        self.client = openai.AsyncOpenAI()

        # If in interpretation mode, don't connect to Realtime API
        if self.interpretation_mode:
-            print(f"[INTERPRETATION MODE] Active -
+            print(f"[INTERPRETATION MODE] Active - Skipping Realtime API connection")
+            print(f"[INTERPRETATION MODE] Using Whisper + GPT-4o-mini (text only)")
            print(f"[INTERPRETATION MODE] Target languages: {self.interpretation_languages}")
+
+            # Do NOT connect to Realtime API
            # Just keep the handler ready to process audio
-            # Keep the emit loop running for interpretation results
-            asyncio.create_task(self._interpretation_loop())
            return

        # Normal mode - connect to Realtime API
-
-
-
-
-        #
-        if self.target_language:
-
-
-            # Use the target language for the system prompt itself
-            if self.target_language == "en":
-                translation_instructions = """
+        print(f"[NORMAL MODE] Connecting to Realtime API...")
+
+        # Define the web search function
+        tools = []
+        base_instructions = self.system_prompt or "You are a helpful assistant."
+
+        # Add translation instructions if language is selected
+        if self.target_language:
+            language_name = SUPPORTED_LANGUAGES.get(self.target_language, self.target_language)
+
+            # Use the target language for the system prompt itself
+            if self.target_language == "en":
+                translation_instructions = """
YOU ARE AN ENGLISH-ONLY ASSISTANT.

ABSOLUTE RULES:
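Note: the Whisper + GPT-4o-mini pipeline that interpretation mode hands buffered audio to (the removed _interpretation_loop call above) is not part of this commit. A minimal sketch of one plausible shape, assuming buffered PCM16 numpy chunks and the openai Python SDK; the function name interpret_chunk and the 24 kHz sample rate are assumptions:

import io
import wave
import numpy as np
import openai

async def interpret_chunk(client: openai.AsyncOpenAI,
                          audio_chunks: list[np.ndarray],
                          target_language_name: str,
                          sample_rate: int = 24000) -> str:
    # Pack the buffered PCM16 samples into an in-memory WAV file for Whisper.
    pcm = np.concatenate(audio_chunks).astype(np.int16)
    buf = io.BytesIO()
    with wave.open(buf, "wb") as wav:
        wav.setnchannels(1)
        wav.setsampwidth(2)        # 16-bit samples
        wav.setframerate(sample_rate)
        wav.writeframes(pcm.tobytes())
    buf.seek(0)

    # Speech-to-text with Whisper, then text-only translation with gpt-4o-mini.
    transcript = await client.audio.transcriptions.create(
        model="whisper-1",
        file=("speech.wav", buf.read()),
    )
    completion = await client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system",
             "content": f"Translate the user's words into {target_language_name}. Reply with the translation only."},
            {"role": "user", "content": transcript.text},
        ],
    )
    return completion.choices[0].message.content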
@@ -1831,11 +1891,11 @@ YOUR LANGUAGE MODE: ENGLISH ONLY
DO NOT USE: 안녕하세요, 감사합니다, or any Korean
ALWAYS USE: Hello, Thank you, and English words only
"""
-
-
-
-
-
+                # Override base instructions to be in English
+                base_instructions = "You are a helpful assistant that speaks ONLY English."
+
+            elif self.target_language == "ja":
+                translation_instructions = """
あなたは日本語のみを話すアシスタントです。

絶対的なルール:
@@ -1849,10 +1909,10 @@ ALWAYS USE: Hello, Thank you, and English words only
使用禁止:안녕하세요、감사합니다、韓国語全般
必ず使用:こんにちは、ありがとうございます、日本語のみ
"""
-
-
-
-
+                base_instructions = "あなたは日本語のみを話す親切なアシスタントです。"
+
+            elif self.target_language == "zh":
+                translation_instructions = """
你是一个只说中文的助手。

绝对规则:
@@ -1866,10 +1926,10 @@ ALWAYS USE: Hello, Thank you, and English words only
禁止使用:안녕하세요、감사합니다、任何韩语
必须使用:你好、谢谢、只用中文
"""
-
-
-
-
+                base_instructions = "你是一个只说中文的友好助手。"
+
+            elif self.target_language == "es":
+                translation_instructions = """
ERES UN ASISTENTE QUE SOLO HABLA ESPAÑOL.

REGLAS ABSOLUTAS:
@@ -1883,9 +1943,9 @@ MODO DE IDIOMA: SOLO ESPAÑOL
NO USAR: 안녕하세요, 감사합니다, o cualquier coreano
SIEMPRE USAR: Hola, Gracias, y solo palabras en español
"""
-
-
-
+                base_instructions = "Eres un asistente útil que habla SOLO español."
+            else:
+                translation_instructions = f"""
YOU MUST ONLY SPEAK {language_name.upper()}.

RULES:
@@ -1893,181 +1953,182 @@ RULES:
2. Never use Korean
3. Always respond in {language_name}
"""
-
+                base_instructions = f"You are a helpful assistant that speaks ONLY {language_name}."
+        else:
+            translation_instructions = ""
+
+        if self.web_search_enabled and self.search_client:
+            tools = [{
+                "type": "function",
+                "function": {
+                    "name": "web_search",
+                    "description": "Search the web for current information. Use this for weather, news, prices, current events, or any time-sensitive topics.",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "query": {
+                                "type": "string",
+                                "description": "The search query"
+                            }
+                        },
+                        "required": ["query"]
+                    }
+                }
+            }]
+            print("Web search function added to tools")
+
+            search_instructions = (
+                "\n\nYou have web search capabilities. "
+                "IMPORTANT: You MUST use the web_search function for ANY of these topics:\n"
+                "- Weather (날씨, 기온, 비, 눈)\n"
+                "- News (뉴스, 소식)\n"
+                "- Current events (현재, 최근, 오늘, 지금)\n"
+                "- Prices (가격, 환율, 주가)\n"
+                "- Sports scores or results\n"
+                "- Any question about 2024 or 2025\n"
+                "- Any time-sensitive information\n\n"
+                "When in doubt, USE web_search. It's better to search and provide accurate information "
+                "than to guess or use outdated information."
+            )
+
+            # Combine all instructions
+            if translation_instructions:
+                # Translation instructions already include base_instructions
+                instructions = translation_instructions + search_instructions
            else:
-

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                        "required": ["query"]
-                    }
-                }
-            }]
-            print("Web search function added to tools")

-
-
-                "IMPORTANT: You MUST use the web_search function for ANY of these topics:\n"
-                "- Weather (날씨, 기온, 비, 눈)\n"
-                "- News (뉴스, 소식)\n"
-                "- Current events (현재, 최근, 오늘, 지금)\n"
-                "- Prices (가격, 환율, 주가)\n"
-                "- Sports scores or results\n"
-                "- Any question about 2024 or 2025\n"
-                "- Any time-sensitive information\n\n"
-                "When in doubt, USE web_search. It's better to search and provide accurate information "
-                "than to guess or use outdated information."
-            )

-            #
-
-
-
-
-
-
-
-            if translation_instructions:
-                instructions = translation_instructions
-            else:
-                instructions = base_instructions

-
-
-        print(f"
-        print(f"[NORMAL MODE] Target language: {self.target_language}")

-        async
-
-
-
-            session_update = {
-                "turn_detection": {"type": "server_vad"},
-                "instructions": instructions,
-                "tools": tools,
-                "tool_choice": "auto" if tools else "none",
-                "temperature": 0.7,
-                "max_response_output_tokens": 4096,
-                "modalities": ["text", "audio"],
-                "voice": "alloy"  # Default voice
-            }

-
-
-
-                # 1. Use voice that's known to work well with the language
-                voice_map = {
-                    "en": "nova",  # Nova has clearer English
-                    "es": "nova",  # Nova works for Spanish
-                    "fr": "shimmer",  # Shimmer for French
-                    "de": "echo",  # Echo for German
-                    "ja": "alloy",  # Alloy can do Japanese
-                    "zh": "alloy",  # Alloy can do Chinese
-                    "ko": "nova",  # Nova for Korean
-                }
-                session_update["voice"] = voice_map.get(self.target_language, "nova")
-
-                # 2. Add language to modalities (experimental)
-                session_update["modalities"] = ["text", "audio"]
-
-                # 3. Set output format
-                session_update["output_audio_format"] = "pcm16"

-
-
-
-
-

-
-
-

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            )
-            )
-
-            # Handle function calls (only in non-interpretation mode)
-            elif event.type == "response.function_call_arguments.start" and not self.interpretation_mode:
-                print(f"Function call started")
-                self.function_call_in_progress = True
-                self.current_function_args = ""
-                self.current_call_id = getattr(event, 'call_id', None)
-
-            elif event.type == "response.function_call_arguments.delta" and not self.interpretation_mode:
-                if self.function_call_in_progress:
-                    self.current_function_args += event.delta
-
-            elif event.type == "response.function_call_arguments.done" and not self.interpretation_mode:
-                if self.function_call_in_progress:
-                    print(f"Function call done, args: {self.current_function_args}")
-                    try:
-                        args = json.loads(self.current_function_args)
-                        query = args.get("query", "")
-
-                        # Emit search event to client
-                        await self.output_queue.put(AdditionalOutputs({
-                            "type": "search",
-                            "query": query
-                        }))
-
-                        # Perform the search
-                        search_results = await self.search_web(query)
-                        print(f"Search results length: {len(search_results)}")
-
-                        # Send function result back to the model
-                        if self.connection and self.current_call_id:
-                            await self.connection.conversation.item.create(
-                                item={
-                                    "type": "function_call_output",
-                                    "call_id": self.current_call_id,
-                                    "output": search_results
-                                }
-                            )
-                            await self.connection.response.create()

-
-
-
-
-
-
+                instructions = base_instructions + search_instructions
+        else:
+            # No web search
+            if translation_instructions:
+                instructions = translation_instructions
+            else:
+                instructions = base_instructions
+
+        print(f"[NORMAL MODE] Base instructions: {base_instructions[:100]}...")
+        print(f"[NORMAL MODE] Translation instructions: {translation_instructions[:200] if translation_instructions else 'None'}...")
+        print(f"[NORMAL MODE] Combined instructions length: {len(instructions)}")
+        print(f"[NORMAL MODE] Target language: {self.target_language}")
+
+        async with self.client.beta.realtime.connect(
+            model="gpt-4o-mini-realtime-preview-2024-12-17"
+        ) as conn:
+            # Update session with tools
+            session_update = {
+                "turn_detection": {"type": "server_vad"},
+                "instructions": instructions,
+                "tools": tools,
+                "tool_choice": "auto" if tools else "none",
+                "temperature": 0.7,
+                "max_response_output_tokens": 4096,
+                "modalities": ["text", "audio"],
+                "voice": "alloy"  # Default voice
+            }
+
+            # Use appropriate voice for the language
+            if self.target_language:
+                # Force language through multiple mechanisms
+                # 1. Use voice that's known to work well with the language
+                voice_map = {
+                    "en": "nova",  # Nova has clearer English
+                    "es": "nova",  # Nova works for Spanish
+                    "fr": "shimmer",  # Shimmer for French
+                    "de": "echo",  # Echo for German
+                    "ja": "alloy",  # Alloy can do Japanese
+                    "zh": "alloy",  # Alloy can do Chinese
+                    "ko": "nova",  # Nova for Korean
+                }
+                session_update["voice"] = voice_map.get(self.target_language, "nova")
+
+                # 2. Add language to modalities (experimental)
+                session_update["modalities"] = ["text", "audio"]
+
+                # 3. Set output format
+                session_update["output_audio_format"] = "pcm16"
+
+                # 4. Add language hint to the system (if supported by API)
+                if self.target_language in ["en", "es", "fr", "de", "ja", "zh"]:
+                    session_update["language"] = self.target_language  # Try setting language directly
+
+                print(f"[TRANSLATION MODE] Session update: {json.dumps(session_update, indent=2)}")
+
+            await conn.session.update(session=session_update)
+            self.connection = conn
+            print(f"Connected with tools: {len(tools)} functions, voice: {session_update.get('voice', 'default')}")
+
+            async for event in self.connection:
+                # Debug logging for function calls
+                if event.type.startswith("response.function_call"):
+                    print(f"Function event: {event.type}")
+
+                if event.type == "response.audio_transcript.done":
+                    print(f"[RESPONSE] Transcript: {event.transcript[:100]}...")
+                    print(f"[RESPONSE] Expected language: {self.target_language}")
+
+                    output_data = {
+                        "event": event,
+                        "language": SUPPORTED_LANGUAGES.get(self.target_language, "") if self.target_language else ""
+                    }
+                    await self.output_queue.put(AdditionalOutputs(output_data))
+
+                elif event.type == "response.audio.delta":
+                    await self.output_queue.put(
+                        (
+                            self.output_sample_rate,
+                            np.frombuffer(
+                                base64.b64decode(event.delta), dtype=np.int16
+                            ).reshape(1, -1),
+                        ),
+                    )
+
+                # Handle function calls (only in non-interpretation mode)
+                elif event.type == "response.function_call_arguments.start" and not self.interpretation_mode:
+                    print(f"Function call started")
+                    self.function_call_in_progress = True
+                    self.current_function_args = ""
+                    self.current_call_id = getattr(event, 'call_id', None)
+
+                elif event.type == "response.function_call_arguments.delta" and not self.interpretation_mode:
+                    if self.function_call_in_progress:
+                        self.current_function_args += event.delta
+
+                elif event.type == "response.function_call_arguments.done" and not self.interpretation_mode:
+                    if self.function_call_in_progress:
+                        print(f"Function call done, args: {self.current_function_args}")
+                        try:
+                            args = json.loads(self.current_function_args)
+                            query = args.get("query", "")
+
+                            # Emit search event to client
+                            await self.output_queue.put(AdditionalOutputs({
+                                "type": "search",
+                                "query": query
+                            }))
+
+                            # Perform the search
+                            search_results = await self.search_web(query)
+                            print(f"Search results length: {len(search_results)}")
+
+                            # Send function result back to the model
+                            if self.connection and self.current_call_id:
+                                await self.connection.conversation.item.create(
+                                    item={
+                                        "type": "function_call_output",
+                                        "call_id": self.current_call_id,
+                                        "output": search_results
+                                    }
+                                )
+                                await self.connection.response.create()
+
+                        except Exception as e:
+                            print(f"Function call error: {e}")
+                        finally:
+                            self.function_call_in_progress = False
+                            self.current_function_args = ""
+                            self.current_call_id = None

    async def receive(self, frame: tuple[int, np.ndarray]) -> None:
        if self.interpretation_mode:
+            print(f"[RECEIVE] Processing in interpretation mode")
            # In interpretation mode, buffer audio and process with Whisper
            _, array = frame
            array = array.squeeze()
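Note: SUPPORTED_LANGUAGES is referenced in this hunk (and in start_up above) but defined outside the lines shown in this commit. As an assumption for readability, it maps ISO language codes to display names along these lines, consistent with the voice_map keys:

# Assumed shape only; the real mapping lives elsewhere in app.py.
SUPPORTED_LANGUAGES = {
    "en": "English",
    "es": "Spanish",
    "fr": "French",
    "de": "German",
    "ja": "Japanese",
    "zh": "Chinese",
    "ko": "Korean",
}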
@@ -2093,6 +2154,7 @@ RULES:
        else:
            # Normal mode - use Realtime API
            if not self.connection:
+                print(f"[RECEIVE] No connection in normal mode, skipping")
                return
            try:
                _, array = frame
@@ -2101,16 +2163,16 @@ RULES:
                await self.connection.input_audio_buffer.append(audio=audio_message)
            except Exception as e:
                print(f"Error in receive: {e}")
-                # Connection might be closed, ignore the error

    async def emit(self) -> tuple[int, np.ndarray] | AdditionalOutputs | None:
-        # In interpretation mode, we need to keep checking for
+        # In interpretation mode, we need to keep checking for outputs
        if self.interpretation_mode:
-            # Use a timeout to prevent blocking
+            # Use a short timeout to prevent blocking
            try:
-                item = await asyncio.wait_for(wait_for_item(self.output_queue), timeout=0.
+                item = await asyncio.wait_for(wait_for_item(self.output_queue), timeout=0.05)
                return item
            except asyncio.TimeoutError:
+                # Return None to keep the stream alive
                return None
        else:
            # Normal mode
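Note: the construction of audio_message is outside the lines shown above. A minimal sketch of the usual encoding for the Realtime API input buffer, mirroring the np.frombuffer(base64.b64decode(...), dtype=np.int16) decode used in the response handler; the helper name is hypothetical:

import base64
import numpy as np

def encode_audio_frame(array: np.ndarray) -> str:
    # PCM16 samples -> raw bytes -> base64 string, the inverse of the decode in the audio.delta handler.
    return base64.b64encode(array.astype(np.int16).tobytes()).decode("utf-8")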
@@ -2124,6 +2186,8 @@ RULES:
            return item

    async def shutdown(self) -> None:
+        print(f"[SHUTDOWN] Called with interpretation_mode={self.interpretation_mode}")
+
        if self.interpretation_mode:
            # Clean up interpretation mode
            self.audio_buffer = []
@@ -2134,6 +2198,7 @@ RULES:
            if self.connection:
                await self.connection.close()
                self.connection = None
+                print("[NORMAL MODE] Connection closed")


# Create initial handler instance
@@ -2173,9 +2238,11 @@ async def custom_offer(request: Request):
    interpretation_mode = body.get("interpretation_mode", False)
    interpretation_languages = body.get("interpretation_languages", [])

-    print(f"
-
-
+    print(f"[OFFER] Received offer with webrtc_id: {webrtc_id}")
+    print(f"[OFFER] interpretation_mode: {interpretation_mode}")
+    print(f"[OFFER] interpretation_languages: {interpretation_languages}")
+    print(f"[OFFER] web_search_enabled: {web_search_enabled}")
+    print(f"[OFFER] target_language: {target_language}")

    # Store settings with timestamp
    if webrtc_id:
@@ -2187,6 +2254,9 @@ async def custom_offer(request: Request):
            'interpretation_languages': interpretation_languages,
            'timestamp': asyncio.get_event_loop().time()
        }
+
+        print(f"[OFFER] Stored settings for {webrtc_id}:")
+        print(f"[OFFER] {connection_settings[webrtc_id]}")

    # Remove our custom route temporarily
    custom_route = None
@@ -2196,12 +2266,15 @@ async def custom_offer(request: Request):
            break

    # Forward to stream's offer handler
+    print(f"[OFFER] Forwarding to stream.offer()")
    response = await stream.offer(body)

    # Re-add our custom route
    if custom_route:
        app.routes.insert(0, custom_route)

+    print(f"[OFFER] Response status: {response.get('status', 'unknown') if isinstance(response, dict) else 'OK'}")
+
    return response