Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1593,6 +1593,11 @@ class OpenAIHandler(AsyncStreamHandler):
|
|
1593 |
)
|
1594 |
await self.connection.response.create()
|
1595 |
|
|
|
|
|
|
|
|
|
|
|
1596 |
async def process_interpretation(self):
|
1597 |
"""Process audio buffer for interpretation - text only output"""
|
1598 |
if not self.audio_buffer or not self.interpretation_languages:
|
@@ -1620,7 +1625,7 @@ class OpenAIHandler(AsyncStreamHandler):
|
|
1620 |
transcript = await self.client.audio.transcriptions.create(
|
1621 |
model="whisper-1",
|
1622 |
file=wav_buffer,
|
1623 |
-
|
1624 |
)
|
1625 |
|
1626 |
user_text = transcript.text.strip()
|
@@ -1672,19 +1677,22 @@ class OpenAIHandler(AsyncStreamHandler):
|
|
1672 |
examples = translation_examples.get(lang_code, translation_examples.get("en", {}))
|
1673 |
examples_text = "\n".join([f'"{k}" → "{v}"' for k, v in examples.items()])
|
1674 |
|
1675 |
-
system_prompt = f"""You are a Korean to {target_lang_name}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1676 |
|
1677 |
-
|
1678 |
-
|
1679 |
-
2. Do NOT output Korean
|
1680 |
-
3. Do NOT add explanations
|
1681 |
-
4. Do NOT answer questions
|
1682 |
-
5. Just translate
|
1683 |
|
1684 |
Examples:
|
1685 |
{examples_text}
|
1686 |
|
1687 |
-
|
1688 |
|
1689 |
print(f"[INTERPRETATION] Translating to {target_lang_name}...")
|
1690 |
|
@@ -1697,7 +1705,7 @@ Now translate the Korean text to {target_lang_name}. Output ONLY the translation
|
|
1697 |
},
|
1698 |
{
|
1699 |
"role": "user",
|
1700 |
-
"content":
|
1701 |
}
|
1702 |
],
|
1703 |
temperature=0.1,
|
@@ -1793,21 +1801,23 @@ Now translate the Korean text to {target_lang_name}. Output ONLY the translation
|
|
1793 |
print(f"[INTERPRETATION MODE] Active - using Whisper + GPT-4o-mini (text only)")
|
1794 |
print(f"[INTERPRETATION MODE] Target languages: {self.interpretation_languages}")
|
1795 |
# Just keep the handler ready to process audio
|
1796 |
-
|
|
|
1797 |
return
|
1798 |
|
1799 |
# Normal mode - connect to Realtime API
|
1800 |
-
|
1801 |
-
|
1802 |
-
|
1803 |
-
|
1804 |
-
# Add translation instructions if language is selected
|
1805 |
-
if self.target_language:
|
1806 |
-
language_name = SUPPORTED_LANGUAGES.get(self.target_language, self.target_language)
|
1807 |
|
1808 |
-
#
|
1809 |
-
if self.target_language
|
1810 |
-
|
|
|
|
|
|
|
|
|
1811 |
YOU ARE AN ENGLISH-ONLY ASSISTANT.
|
1812 |
|
1813 |
ABSOLUTE RULES:
|
@@ -1821,11 +1831,11 @@ YOUR LANGUAGE MODE: ENGLISH ONLY
|
|
1821 |
DO NOT USE: 안녕하세요, 감사합니다, or any Korean
|
1822 |
ALWAYS USE: Hello, Thank you, and English words only
|
1823 |
"""
|
1824 |
-
|
1825 |
-
|
1826 |
-
|
1827 |
-
|
1828 |
-
|
1829 |
あなたは日本語のみを話すアシスタントです。
|
1830 |
|
1831 |
絶対的なルール:
|
@@ -1839,10 +1849,10 @@ ALWAYS USE: Hello, Thank you, and English words only
|
|
1839 |
使用禁止:안녕하세요、감사합니다、韓国語全般
|
1840 |
必ず使用:こんにちは、ありがとうございます、日本語のみ
|
1841 |
"""
|
1842 |
-
|
1843 |
-
|
1844 |
-
|
1845 |
-
|
1846 |
你是一个只说中文的助手。
|
1847 |
|
1848 |
绝对规则:
|
@@ -1856,10 +1866,10 @@ ALWAYS USE: Hello, Thank you, and English words only
|
|
1856 |
禁止使用:안녕하세요、감사합니다、任何韩语
|
1857 |
必须使用:你好、谢谢、只用中文
|
1858 |
"""
|
1859 |
-
|
1860 |
-
|
1861 |
-
|
1862 |
-
|
1863 |
ERES UN ASISTENTE QUE SOLO HABLA ESPAÑOL.
|
1864 |
|
1865 |
REGLAS ABSOLUTAS:
|
@@ -1873,9 +1883,9 @@ MODO DE IDIOMA: SOLO ESPAÑOL
|
|
1873 |
NO USAR: 안녕하세요, 감사합니다, o cualquier coreano
|
1874 |
SIEMPRE USAR: Hola, Gracias, y solo palabras en español
|
1875 |
"""
|
1876 |
-
|
1877 |
-
|
1878 |
-
|
1879 |
YOU MUST ONLY SPEAK {language_name.upper()}.
|
1880 |
|
1881 |
RULES:
|
@@ -1883,178 +1893,178 @@ RULES:
|
|
1883 |
2. Never use Korean
|
1884 |
3. Always respond in {language_name}
|
1885 |
"""
|
1886 |
-
|
1887 |
-
else:
|
1888 |
-
translation_instructions = ""
|
1889 |
-
|
1890 |
-
if self.web_search_enabled and self.search_client:
|
1891 |
-
tools = [{
|
1892 |
-
"type": "function",
|
1893 |
-
"function": {
|
1894 |
-
"name": "web_search",
|
1895 |
-
"description": "Search the web for current information. Use this for weather, news, prices, current events, or any time-sensitive topics.",
|
1896 |
-
"parameters": {
|
1897 |
-
"type": "object",
|
1898 |
-
"properties": {
|
1899 |
-
"query": {
|
1900 |
-
"type": "string",
|
1901 |
-
"description": "The search query"
|
1902 |
-
}
|
1903 |
-
},
|
1904 |
-
"required": ["query"]
|
1905 |
-
}
|
1906 |
-
}
|
1907 |
-
}]
|
1908 |
-
print("Web search function added to tools")
|
1909 |
-
|
1910 |
-
search_instructions = (
|
1911 |
-
"\n\nYou have web search capabilities. "
|
1912 |
-
"IMPORTANT: You MUST use the web_search function for ANY of these topics:\n"
|
1913 |
-
"- Weather (날씨, 기온, 비, 눈)\n"
|
1914 |
-
"- News (뉴스, 소식)\n"
|
1915 |
-
"- Current events (현재, 최근, 오늘, 지금)\n"
|
1916 |
-
"- Prices (가격, 환율, 주가)\n"
|
1917 |
-
"- Sports scores or results\n"
|
1918 |
-
"- Any question about 2024 or 2025\n"
|
1919 |
-
"- Any time-sensitive information\n\n"
|
1920 |
-
"When in doubt, USE web_search. It's better to search and provide accurate information "
|
1921 |
-
"than to guess or use outdated information."
|
1922 |
-
)
|
1923 |
-
|
1924 |
-
# Combine all instructions
|
1925 |
-
if translation_instructions:
|
1926 |
-
# Translation instructions already include base_instructions
|
1927 |
-
instructions = translation_instructions + search_instructions
|
1928 |
-
else:
|
1929 |
-
instructions = base_instructions + search_instructions
|
1930 |
-
else:
|
1931 |
-
# No web search
|
1932 |
-
if translation_instructions:
|
1933 |
-
instructions = translation_instructions
|
1934 |
else:
|
1935 |
-
|
1936 |
-
|
1937 |
-
print(f"[NORMAL MODE] Base instructions: {base_instructions[:100]}...")
|
1938 |
-
print(f"[NORMAL MODE] Translation instructions: {translation_instructions[:200] if translation_instructions else 'None'}...")
|
1939 |
-
print(f"[NORMAL MODE] Combined instructions length: {len(instructions)}")
|
1940 |
-
print(f"[NORMAL MODE] Target language: {self.target_language}")
|
1941 |
-
|
1942 |
-
async with self.client.beta.realtime.connect(
|
1943 |
-
model="gpt-4o-mini-realtime-preview-2024-12-17"
|
1944 |
-
) as conn:
|
1945 |
-
# Update session with tools
|
1946 |
-
session_update = {
|
1947 |
-
"turn_detection": {"type": "server_vad"},
|
1948 |
-
"instructions": instructions,
|
1949 |
-
"tools": tools,
|
1950 |
-
"tool_choice": "auto" if tools else "none",
|
1951 |
-
"temperature": 0.7,
|
1952 |
-
"max_response_output_tokens": 4096,
|
1953 |
-
"modalities": ["text", "audio"],
|
1954 |
-
"voice": "alloy" # Default voice
|
1955 |
-
}
|
1956 |
|
1957 |
-
|
1958 |
-
|
1959 |
-
|
1960 |
-
|
1961 |
-
|
1962 |
-
|
1963 |
-
|
1964 |
-
|
1965 |
-
|
1966 |
-
|
1967 |
-
|
1968 |
-
|
1969 |
-
|
1970 |
-
|
1971 |
-
|
1972 |
-
|
1973 |
-
|
1974 |
-
|
1975 |
-
|
1976 |
-
session_update["output_audio_format"] = "pcm16"
|
1977 |
|
1978 |
-
|
1979 |
-
|
1980 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1981 |
|
1982 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1983 |
|
1984 |
-
|
1985 |
-
|
1986 |
-
print(f"
|
|
|
1987 |
|
1988 |
-
async
|
1989 |
-
|
1990 |
-
|
1991 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1992 |
|
1993 |
-
|
1994 |
-
|
1995 |
-
|
1996 |
-
|
1997 |
-
|
1998 |
-
"
|
1999 |
-
"
|
|
|
|
|
|
|
|
|
|
|
2000 |
}
|
2001 |
-
|
2002 |
-
|
2003 |
-
|
2004 |
-
|
2005 |
-
|
2006 |
-
|
2007 |
-
|
2008 |
-
|
2009 |
-
|
2010 |
-
|
2011 |
-
|
2012 |
-
|
2013 |
-
|
2014 |
-
elif event.type == "response.function_call_arguments.start" and not self.interpretation_mode:
|
2015 |
-
print(f"Function call started")
|
2016 |
-
self.function_call_in_progress = True
|
2017 |
-
self.current_function_args = ""
|
2018 |
-
self.current_call_id = getattr(event, 'call_id', None)
|
2019 |
|
2020 |
-
|
2021 |
-
|
2022 |
-
|
2023 |
|
2024 |
-
|
2025 |
-
|
2026 |
-
|
2027 |
-
|
2028 |
-
|
2029 |
-
|
2030 |
-
|
2031 |
-
|
2032 |
-
await self.output_queue.put(AdditionalOutputs({
|
2033 |
-
"type": "search",
|
2034 |
-
"query": query
|
2035 |
-
}))
|
2036 |
-
|
2037 |
-
# Perform the search
|
2038 |
-
search_results = await self.search_web(query)
|
2039 |
-
print(f"Search results length: {len(search_results)}")
|
2040 |
-
|
2041 |
-
# Send function result back to the model
|
2042 |
-
if self.connection and self.current_call_id:
|
2043 |
-
await self.connection.conversation.item.create(
|
2044 |
-
item={
|
2045 |
-
"type": "function_call_output",
|
2046 |
-
"call_id": self.current_call_id,
|
2047 |
-
"output": search_results
|
2048 |
-
}
|
2049 |
-
)
|
2050 |
-
await self.connection.response.create()
|
2051 |
|
2052 |
-
|
2053 |
-
|
2054 |
-
|
2055 |
-
|
2056 |
-
|
2057 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2058 |
|
2059 |
async def receive(self, frame: tuple[int, np.ndarray]) -> None:
|
2060 |
if self.interpretation_mode:
|
|
|
1593 |
)
|
1594 |
await self.connection.response.create()
|
1595 |
|
1596 |
+
async def _interpretation_loop(self):
|
1597 |
+
"""Keep the handler alive for interpretation mode"""
|
1598 |
+
while self.interpretation_mode:
|
1599 |
+
await asyncio.sleep(0.1)
|
1600 |
+
|
1601 |
async def process_interpretation(self):
|
1602 |
"""Process audio buffer for interpretation - text only output"""
|
1603 |
if not self.audio_buffer or not self.interpretation_languages:
|
|
|
1625 |
transcript = await self.client.audio.transcriptions.create(
|
1626 |
model="whisper-1",
|
1627 |
file=wav_buffer,
|
1628 |
+
response_format="text" # Get only text, no timestamps
|
1629 |
)
|
1630 |
|
1631 |
user_text = transcript.text.strip()
|
|
|
1677 |
examples = translation_examples.get(lang_code, translation_examples.get("en", {}))
|
1678 |
examples_text = "\n".join([f'"{k}" → "{v}"' for k, v in examples.items()])
|
1679 |
|
1680 |
+
system_prompt = f"""You are a direct translator from Korean to {target_lang_name}.
|
1681 |
+
|
1682 |
+
CRITICAL RULES:
|
1683 |
+
1. TRANSLATE ONLY - Do not answer questions
|
1684 |
+
2. Do not generate responses
|
1685 |
+
3. Do not add any commentary
|
1686 |
+
4. Output ONLY the direct translation
|
1687 |
+
5. Preserve the original meaning exactly
|
1688 |
|
1689 |
+
If the user says "오늘 날씨 어때?" translate it as "How's the weather today?" NOT answer about the weather.
|
1690 |
+
If the user says "안녕하세요" translate it as "Hello" NOT respond with greetings.
|
|
|
|
|
|
|
|
|
1691 |
|
1692 |
Examples:
|
1693 |
{examples_text}
|
1694 |
|
1695 |
+
Direct translation to {target_lang_name}:"""
|
1696 |
|
1697 |
print(f"[INTERPRETATION] Translating to {target_lang_name}...")
|
1698 |
|
|
|
1705 |
},
|
1706 |
{
|
1707 |
"role": "user",
|
1708 |
+
"content": user_text # Just the text, no additional prompt
|
1709 |
}
|
1710 |
],
|
1711 |
temperature=0.1,
|
|
|
1801 |
print(f"[INTERPRETATION MODE] Active - using Whisper + GPT-4o-mini (text only)")
|
1802 |
print(f"[INTERPRETATION MODE] Target languages: {self.interpretation_languages}")
|
1803 |
# Just keep the handler ready to process audio
|
1804 |
+
# Keep the emit loop running for interpretation results
|
1805 |
+
asyncio.create_task(self._interpretation_loop())
|
1806 |
return
|
1807 |
|
1808 |
# Normal mode - connect to Realtime API
|
1809 |
+
if not self.interpretation_mode:
|
1810 |
+
# Define the web search function
|
1811 |
+
tools = []
|
1812 |
+
base_instructions = self.system_prompt or "You are a helpful assistant."
|
|
|
|
|
|
|
1813 |
|
1814 |
+
# Add translation instructions if language is selected
|
1815 |
+
if self.target_language:
|
1816 |
+
language_name = SUPPORTED_LANGUAGES.get(self.target_language, self.target_language)
|
1817 |
+
|
1818 |
+
# Use the target language for the system prompt itself
|
1819 |
+
if self.target_language == "en":
|
1820 |
+
translation_instructions = """
|
1821 |
YOU ARE AN ENGLISH-ONLY ASSISTANT.
|
1822 |
|
1823 |
ABSOLUTE RULES:
|
|
|
1831 |
DO NOT USE: 안녕하세요, 감사합니다, or any Korean
|
1832 |
ALWAYS USE: Hello, Thank you, and English words only
|
1833 |
"""
|
1834 |
+
# Override base instructions to be in English
|
1835 |
+
base_instructions = "You are a helpful assistant that speaks ONLY English."
|
1836 |
+
|
1837 |
+
elif self.target_language == "ja":
|
1838 |
+
translation_instructions = """
|
1839 |
あなたは日本語のみを話すアシスタントです。
|
1840 |
|
1841 |
絶対的なルール:
|
|
|
1849 |
使用禁止:안녕하세요、감사합니다、韓国語全般
|
1850 |
必ず使用:こんにちは、ありがとうございます、日本語のみ
|
1851 |
"""
|
1852 |
+
base_instructions = "あなたは日本語のみを話す親切なアシスタントです。"
|
1853 |
+
|
1854 |
+
elif self.target_language == "zh":
|
1855 |
+
translation_instructions = """
|
1856 |
你是一个只说中文的助手。
|
1857 |
|
1858 |
绝对规则:
|
|
|
1866 |
禁止使用:안녕하세요、감사합니다、任何韩语
|
1867 |
必须使用:你好、谢谢、只用中文
|
1868 |
"""
|
1869 |
+
base_instructions = "你是一个只说中文的友好助手。"
|
1870 |
+
|
1871 |
+
elif self.target_language == "es":
|
1872 |
+
translation_instructions = """
|
1873 |
ERES UN ASISTENTE QUE SOLO HABLA ESPAÑOL.
|
1874 |
|
1875 |
REGLAS ABSOLUTAS:
|
|
|
1883 |
NO USAR: 안녕하세요, 감사합니다, o cualquier coreano
|
1884 |
SIEMPRE USAR: Hola, Gracias, y solo palabras en español
|
1885 |
"""
|
1886 |
+
base_instructions = "Eres un asistente útil que habla SOLO español."
|
1887 |
+
else:
|
1888 |
+
translation_instructions = f"""
|
1889 |
YOU MUST ONLY SPEAK {language_name.upper()}.
|
1890 |
|
1891 |
RULES:
|
|
|
1893 |
2. Never use Korean
|
1894 |
3. Always respond in {language_name}
|
1895 |
"""
|
1896 |
+
base_instructions = f"You are a helpful assistant that speaks ONLY {language_name}."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1897 |
else:
|
1898 |
+
translation_instructions = ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1899 |
|
1900 |
+
if self.web_search_enabled and self.search_client:
|
1901 |
+
tools = [{
|
1902 |
+
"type": "function",
|
1903 |
+
"function": {
|
1904 |
+
"name": "web_search",
|
1905 |
+
"description": "Search the web for current information. Use this for weather, news, prices, current events, or any time-sensitive topics.",
|
1906 |
+
"parameters": {
|
1907 |
+
"type": "object",
|
1908 |
+
"properties": {
|
1909 |
+
"query": {
|
1910 |
+
"type": "string",
|
1911 |
+
"description": "The search query"
|
1912 |
+
}
|
1913 |
+
},
|
1914 |
+
"required": ["query"]
|
1915 |
+
}
|
1916 |
+
}
|
1917 |
+
}]
|
1918 |
+
print("Web search function added to tools")
|
|
|
1919 |
|
1920 |
+
search_instructions = (
|
1921 |
+
"\n\nYou have web search capabilities. "
|
1922 |
+
"IMPORTANT: You MUST use the web_search function for ANY of these topics:\n"
|
1923 |
+
"- Weather (날씨, 기온, 비, 눈)\n"
|
1924 |
+
"- News (뉴스, 소식)\n"
|
1925 |
+
"- Current events (현재, 최근, 오늘, 지금)\n"
|
1926 |
+
"- Prices (가격, 환율, 주가)\n"
|
1927 |
+
"- Sports scores or results\n"
|
1928 |
+
"- Any question about 2024 or 2025\n"
|
1929 |
+
"- Any time-sensitive information\n\n"
|
1930 |
+
"When in doubt, USE web_search. It's better to search and provide accurate information "
|
1931 |
+
"than to guess or use outdated information."
|
1932 |
+
)
|
1933 |
|
1934 |
+
# Combine all instructions
|
1935 |
+
if translation_instructions:
|
1936 |
+
# Translation instructions already include base_instructions
|
1937 |
+
instructions = translation_instructions + search_instructions
|
1938 |
+
else:
|
1939 |
+
instructions = base_instructions + search_instructions
|
1940 |
+
else:
|
1941 |
+
# No web search
|
1942 |
+
if translation_instructions:
|
1943 |
+
instructions = translation_instructions
|
1944 |
+
else:
|
1945 |
+
instructions = base_instructions
|
1946 |
|
1947 |
+
print(f"[NORMAL MODE] Base instructions: {base_instructions[:100]}...")
|
1948 |
+
print(f"[NORMAL MODE] Translation instructions: {translation_instructions[:200] if translation_instructions else 'None'}...")
|
1949 |
+
print(f"[NORMAL MODE] Combined instructions length: {len(instructions)}")
|
1950 |
+
print(f"[NORMAL MODE] Target language: {self.target_language}")
|
1951 |
|
1952 |
+
async with self.client.beta.realtime.connect(
|
1953 |
+
model="gpt-4o-mini-realtime-preview-2024-12-17"
|
1954 |
+
) as conn:
|
1955 |
+
# Update session with tools
|
1956 |
+
session_update = {
|
1957 |
+
"turn_detection": {"type": "server_vad"},
|
1958 |
+
"instructions": instructions,
|
1959 |
+
"tools": tools,
|
1960 |
+
"tool_choice": "auto" if tools else "none",
|
1961 |
+
"temperature": 0.7,
|
1962 |
+
"max_response_output_tokens": 4096,
|
1963 |
+
"modalities": ["text", "audio"],
|
1964 |
+
"voice": "alloy" # Default voice
|
1965 |
+
}
|
1966 |
|
1967 |
+
# Use appropriate voice for the language
|
1968 |
+
if self.target_language:
|
1969 |
+
# Force language through multiple mechanisms
|
1970 |
+
# 1. Use voice that's known to work well with the language
|
1971 |
+
voice_map = {
|
1972 |
+
"en": "nova", # Nova has clearer English
|
1973 |
+
"es": "nova", # Nova works for Spanish
|
1974 |
+
"fr": "shimmer", # Shimmer for French
|
1975 |
+
"de": "echo", # Echo for German
|
1976 |
+
"ja": "alloy", # Alloy can do Japanese
|
1977 |
+
"zh": "alloy", # Alloy can do Chinese
|
1978 |
+
"ko": "nova", # Nova for Korean
|
1979 |
}
|
1980 |
+
session_update["voice"] = voice_map.get(self.target_language, "nova")
|
1981 |
+
|
1982 |
+
# 2. Add language to modalities (experimental)
|
1983 |
+
session_update["modalities"] = ["text", "audio"]
|
1984 |
+
|
1985 |
+
# 3. Set output format
|
1986 |
+
session_update["output_audio_format"] = "pcm16"
|
1987 |
+
|
1988 |
+
# 4. Add language hint to the system (if supported by API)
|
1989 |
+
if self.target_language in ["en", "es", "fr", "de", "ja", "zh"]:
|
1990 |
+
session_update["language"] = self.target_language # Try setting language directly
|
1991 |
+
|
1992 |
+
print(f"[TRANSLATION MODE] Session update: {json.dumps(session_update, indent=2)}")
|
|
|
|
|
|
|
|
|
|
|
1993 |
|
1994 |
+
await conn.session.update(session=session_update)
|
1995 |
+
self.connection = conn
|
1996 |
+
print(f"Connected with tools: {len(tools)} functions, voice: {session_update.get('voice', 'default')}")
|
1997 |
|
1998 |
+
async for event in self.connection:
|
1999 |
+
# Debug logging for function calls
|
2000 |
+
if event.type.startswith("response.function_call"):
|
2001 |
+
print(f"Function event: {event.type}")
|
2002 |
+
|
2003 |
+
if event.type == "response.audio_transcript.done":
|
2004 |
+
print(f"[RESPONSE] Transcript: {event.transcript[:100]}...")
|
2005 |
+
print(f"[RESPONSE] Expected language: {self.target_language}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2006 |
|
2007 |
+
output_data = {
|
2008 |
+
"event": event,
|
2009 |
+
"language": SUPPORTED_LANGUAGES.get(self.target_language, "") if self.target_language else ""
|
2010 |
+
}
|
2011 |
+
await self.output_queue.put(AdditionalOutputs(output_data))
|
2012 |
+
|
2013 |
+
elif event.type == "response.audio.delta":
|
2014 |
+
await self.output_queue.put(
|
2015 |
+
(
|
2016 |
+
self.output_sample_rate,
|
2017 |
+
np.frombuffer(
|
2018 |
+
base64.b64decode(event.delta), dtype=np.int16
|
2019 |
+
).reshape(1, -1),
|
2020 |
+
),
|
2021 |
+
)
|
2022 |
+
|
2023 |
+
# Handle function calls (only in non-interpretation mode)
|
2024 |
+
elif event.type == "response.function_call_arguments.start" and not self.interpretation_mode:
|
2025 |
+
print(f"Function call started")
|
2026 |
+
self.function_call_in_progress = True
|
2027 |
+
self.current_function_args = ""
|
2028 |
+
self.current_call_id = getattr(event, 'call_id', None)
|
2029 |
+
|
2030 |
+
elif event.type == "response.function_call_arguments.delta" and not self.interpretation_mode:
|
2031 |
+
if self.function_call_in_progress:
|
2032 |
+
self.current_function_args += event.delta
|
2033 |
+
|
2034 |
+
elif event.type == "response.function_call_arguments.done" and not self.interpretation_mode:
|
2035 |
+
if self.function_call_in_progress:
|
2036 |
+
print(f"Function call done, args: {self.current_function_args}")
|
2037 |
+
try:
|
2038 |
+
args = json.loads(self.current_function_args)
|
2039 |
+
query = args.get("query", "")
|
2040 |
+
|
2041 |
+
# Emit search event to client
|
2042 |
+
await self.output_queue.put(AdditionalOutputs({
|
2043 |
+
"type": "search",
|
2044 |
+
"query": query
|
2045 |
+
}))
|
2046 |
+
|
2047 |
+
# Perform the search
|
2048 |
+
search_results = await self.search_web(query)
|
2049 |
+
print(f"Search results length: {len(search_results)}")
|
2050 |
+
|
2051 |
+
# Send function result back to the model
|
2052 |
+
if self.connection and self.current_call_id:
|
2053 |
+
await self.connection.conversation.item.create(
|
2054 |
+
item={
|
2055 |
+
"type": "function_call_output",
|
2056 |
+
"call_id": self.current_call_id,
|
2057 |
+
"output": search_results
|
2058 |
+
}
|
2059 |
+
)
|
2060 |
+
await self.connection.response.create()
|
2061 |
+
|
2062 |
+
except Exception as e:
|
2063 |
+
print(f"Function call error: {e}")
|
2064 |
+
finally:
|
2065 |
+
self.function_call_in_progress = False
|
2066 |
+
self.current_function_args = ""
|
2067 |
+
self.current_call_id = None
|
2068 |
|
2069 |
async def receive(self, frame: tuple[int, np.ndarray]) -> None:
|
2070 |
if self.interpretation_mode:
|