Private-AI

Running

App Files Community

seawolf2357 committed 19 days ago

Commit

753be88

verified ·

1 Parent(s): ae55ef4

Update app.py

Browse files

Files changed (1) hide show

app.py +208 -198

app.py CHANGED Viewed

@@ -1593,6 +1593,11 @@ class OpenAIHandler(AsyncStreamHandler):
             )
             await self.connection.response.create()
     async def process_interpretation(self):
         """Process audio buffer for interpretation - text only output"""
         if not self.audio_buffer or not self.interpretation_languages:
@@ -1620,7 +1625,7 @@ class OpenAIHandler(AsyncStreamHandler):
             transcript = await self.client.audio.transcriptions.create(
                 model="whisper-1",
                 file=wav_buffer,
-                language="ko"  # Assuming Korean input
             )
             user_text = transcript.text.strip()
@@ -1672,19 +1677,22 @@ class OpenAIHandler(AsyncStreamHandler):
                 examples = translation_examples.get(lang_code, translation_examples.get("en", {}))
                 examples_text = "\n".join([f'"{k}" → "{v}"' for k, v in examples.items()])
-                system_prompt = f"""You are a Korean to {target_lang_name} translator.
-STRICT RULES:
-1. Output ONLY the {target_lang_name} translation
-2. Do NOT output Korean
-3. Do NOT add explanations
-4. Do NOT answer questions
-5. Just translate
 Examples:
 {examples_text}
-Now translate the Korean text to {target_lang_name}. Output ONLY the translation in {target_lang_name}:"""
                 print(f"[INTERPRETATION] Translating to {target_lang_name}...")
@@ -1697,7 +1705,7 @@ Now translate the Korean text to {target_lang_name}. Output ONLY the translation
                         },
                         {
                             "role": "user",
-                            "content": f"Translate this Korean to {target_lang_name}: {user_text}"
                         }
                     ],
                     temperature=0.1,
@@ -1793,21 +1801,23 @@ Now translate the Korean text to {target_lang_name}. Output ONLY the translation
             print(f"[INTERPRETATION MODE] Active - using Whisper + GPT-4o-mini (text only)")
             print(f"[INTERPRETATION MODE] Target languages: {self.interpretation_languages}")
             # Just keep the handler ready to process audio
-            self.client = openai.AsyncOpenAI()
             return
         # Normal mode - connect to Realtime API
-        # Define the web search function
-        tools = []
-        base_instructions = self.system_prompt or "You are a helpful assistant."
-        # Add translation instructions if language is selected
-        if self.target_language:
-            language_name = SUPPORTED_LANGUAGES.get(self.target_language, self.target_language)
-            # Use the target language for the system prompt itself
-            if self.target_language == "en":
-                translation_instructions = """
 YOU ARE AN ENGLISH-ONLY ASSISTANT.
 ABSOLUTE RULES:
@@ -1821,11 +1831,11 @@ YOUR LANGUAGE MODE: ENGLISH ONLY
 DO NOT USE: 안녕하세요, 감사합니다, or any Korean
 ALWAYS USE: Hello, Thank you, and English words only
 """
-                # Override base instructions to be in English
-                base_instructions = "You are a helpful assistant that speaks ONLY English."
-            elif self.target_language == "ja":
-                translation_instructions = """
 あなたは日本語のみを話すアシスタントです。
 絶対的なルール：
@@ -1839,10 +1849,10 @@ ALWAYS USE: Hello, Thank you, and English words only
 使用禁止：안녕하세요、감사합니다、韓国語全般
 必ず使用：こんにちは、ありがとうございます、日本語のみ
 """
-                base_instructions = "あなたは日本語のみを話す親切なアシスタントです。"
-            elif self.target_language == "zh":
-                translation_instructions = """
 你是一个只说中文的助手。
 绝对规则：
@@ -1856,10 +1866,10 @@ ALWAYS USE: Hello, Thank you, and English words only
 禁止使用：안녕하세요、감사합니다、任何韩语
 必须使用：你好、谢谢、只用中文
 """
-                base_instructions = "你是一个只说中文的友好助手。"
-            elif self.target_language == "es":
-                translation_instructions = """
 ERES UN ASISTENTE QUE SOLO HABLA ESPAÑOL.
 REGLAS ABSOLUTAS:
@@ -1873,9 +1883,9 @@ MODO DE IDIOMA: SOLO ESPAÑOL
 NO USAR: 안녕하세요, 감사합니다, o cualquier coreano
 SIEMPRE USAR: Hola, Gracias, y solo palabras en español
 """
-                base_instructions = "Eres un asistente útil que habla SOLO español."
-            else:
-                translation_instructions = f"""
 YOU MUST ONLY SPEAK {language_name.upper()}.
 RULES:
@@ -1883,178 +1893,178 @@ RULES:
 2. Never use Korean
 3. Always respond in {language_name}
 """
-                base_instructions = f"You are a helpful assistant that speaks ONLY {language_name}."
-        else:
-            translation_instructions = ""
-        if self.web_search_enabled and self.search_client:
-            tools = [{
-                "type": "function",
-                "function": {
-                    "name": "web_search",
-                    "description": "Search the web for current information. Use this for weather, news, prices, current events, or any time-sensitive topics.",
-                    "parameters": {
-                        "type": "object",
-                        "properties": {
-                            "query": {
-                                "type": "string",
-                                "description": "The search query"
-                            }
-                        },
-                        "required": ["query"]
-                    }
-                }
-            }]
-            print("Web search function added to tools")
-            search_instructions = (
-                "\n\nYou have web search capabilities. "
-                "IMPORTANT: You MUST use the web_search function for ANY of these topics:\n"
-                "- Weather (날씨, 기온, 비, 눈)\n"
-                "- News (뉴스, 소식)\n"
-                "- Current events (현재, 최근, 오늘, 지금)\n"
-                "- Prices (가격, 환율, 주가)\n"
-                "- Sports scores or results\n"
-                "- Any question about 2024 or 2025\n"
-                "- Any time-sensitive information\n\n"
-                "When in doubt, USE web_search. It's better to search and provide accurate information "
-                "than to guess or use outdated information."
-            )
-            # Combine all instructions
-            if translation_instructions:
-                # Translation instructions already include base_instructions
-                instructions = translation_instructions + search_instructions
-            else:
-                instructions = base_instructions + search_instructions
-        else:
-            # No web search
-            if translation_instructions:
-                instructions = translation_instructions
             else:
-                instructions = base_instructions
-        print(f"[NORMAL MODE] Base instructions: {base_instructions[:100]}...")
-        print(f"[NORMAL MODE] Translation instructions: {translation_instructions[:200] if translation_instructions else 'None'}...")
-        print(f"[NORMAL MODE] Combined instructions length: {len(instructions)}")
-        print(f"[NORMAL MODE] Target language: {self.target_language}")
-        async with self.client.beta.realtime.connect(
-            model="gpt-4o-mini-realtime-preview-2024-12-17"
-        ) as conn:
-            # Update session with tools
-            session_update = {
-                "turn_detection": {"type": "server_vad"},
-                "instructions": instructions,
-                "tools": tools,
-                "tool_choice": "auto" if tools else "none",
-                "temperature": 0.7,
-                "max_response_output_tokens": 4096,
-                "modalities": ["text", "audio"],
-                "voice": "alloy"  # Default voice
-            }
-            # Use appropriate voice for the language
-            if self.target_language:
-                # Force language through multiple mechanisms
-                # 1. Use voice that's known to work well with the language
-                voice_map = {
-                    "en": "nova",      # Nova has clearer English
-                    "es": "nova",      # Nova works for Spanish
-                    "fr": "shimmer",   # Shimmer for French
-                    "de": "echo",      # Echo for German
-                    "ja": "alloy",     # Alloy can do Japanese
-                    "zh": "alloy",     # Alloy can do Chinese
-                    "ko": "nova",      # Nova for Korean
-                }
-                session_update["voice"] = voice_map.get(self.target_language, "nova")
-                # 2. Add language to modalities (experimental)
-                session_update["modalities"] = ["text", "audio"]
-                # 3. Set output format
-                session_update["output_audio_format"] = "pcm16"
-                # 4. Add language hint to the system (if supported by API)
-                if self.target_language in ["en", "es", "fr", "de", "ja", "zh"]:
-                    session_update["language"] = self.target_language  # Try setting language directly
-                print(f"[TRANSLATION MODE] Session update: {json.dumps(session_update, indent=2)}")
-            await conn.session.update(session=session_update)
-            self.connection = conn
-            print(f"Connected with tools: {len(tools)} functions, voice: {session_update.get('voice', 'default')}")
-            async for event in self.connection:
-                # Debug logging for function calls
-                if event.type.startswith("response.function_call"):
-                    print(f"Function event: {event.type}")
-                if event.type == "response.audio_transcript.done":
-                    print(f"[RESPONSE] Transcript: {event.transcript[:100]}...")
-                    print(f"[RESPONSE] Expected language: {self.target_language}")
-                    output_data = {
-                        "event": event,
-                        "language": SUPPORTED_LANGUAGES.get(self.target_language, "") if self.target_language else ""
                     }
-                    await self.output_queue.put(AdditionalOutputs(output_data))
-                elif event.type == "response.audio.delta":
-                    await self.output_queue.put(
-                        (
-                            self.output_sample_rate,
-                            np.frombuffer(
-                                base64.b64decode(event.delta), dtype=np.int16
-                            ).reshape(1, -1),
-                        ),
-                    )
-                # Handle function calls (only in non-interpretation mode)
-                elif event.type == "response.function_call_arguments.start" and not self.interpretation_mode:
-                    print(f"Function call started")
-                    self.function_call_in_progress = True
-                    self.current_function_args = ""
-                    self.current_call_id = getattr(event, 'call_id', None)
-                elif event.type == "response.function_call_arguments.delta" and not self.interpretation_mode:
-                    if self.function_call_in_progress:
-                        self.current_function_args += event.delta
-                elif event.type == "response.function_call_arguments.done" and not self.interpretation_mode:
-                    if self.function_call_in_progress:
-                        print(f"Function call done, args: {self.current_function_args}")
-                        try:
-                            args = json.loads(self.current_function_args)
-                            query = args.get("query", "")
-                            # Emit search event to client
-                            await self.output_queue.put(AdditionalOutputs({
-                                "type": "search",
-                                "query": query
-                            }))
-                            # Perform the search
-                            search_results = await self.search_web(query)
-                            print(f"Search results length: {len(search_results)}")
-                            # Send function result back to the model
-                            if self.connection and self.current_call_id:
-                                await self.connection.conversation.item.create(
-                                    item={
-                                        "type": "function_call_output",
-                                        "call_id": self.current_call_id,
-                                        "output": search_results
-                                    }
-                                )
-                                await self.connection.response.create()
-                        except Exception as e:
-                            print(f"Function call error: {e}")
-                        finally:
-                            self.function_call_in_progress = False
-                            self.current_function_args = ""
-                            self.current_call_id = None
     async def receive(self, frame: tuple[int, np.ndarray]) -> None:
         if self.interpretation_mode:

             )
             await self.connection.response.create()
+    async def _interpretation_loop(self):
+        """Keep the handler alive for interpretation mode"""
+        while self.interpretation_mode:
+            await asyncio.sleep(0.1)
     async def process_interpretation(self):
         """Process audio buffer for interpretation - text only output"""
         if not self.audio_buffer or not self.interpretation_languages:
             transcript = await self.client.audio.transcriptions.create(
                 model="whisper-1",
                 file=wav_buffer,
+                response_format="text"  # Get only text, no timestamps
             )
             user_text = transcript.text.strip()
                 examples = translation_examples.get(lang_code, translation_examples.get("en", {}))
                 examples_text = "\n".join([f'"{k}" → "{v}"' for k, v in examples.items()])
+                system_prompt = f"""You are a direct translator from Korean to {target_lang_name}.
+CRITICAL RULES:
+1. TRANSLATE ONLY - Do not answer questions
+2. Do not generate responses
+3. Do not add any commentary
+4. Output ONLY the direct translation
+5. Preserve the original meaning exactly
+If the user says "오늘 날씨 어때?" translate it as "How's the weather today?" NOT answer about the weather.
+If the user says "안녕하세요" translate it as "Hello" NOT respond with greetings.
 Examples:
 {examples_text}
+Direct translation to {target_lang_name}:"""
                 print(f"[INTERPRETATION] Translating to {target_lang_name}...")
                         },
                         {
                             "role": "user",
+                            "content": user_text  # Just the text, no additional prompt
                         }
                     ],
                     temperature=0.1,
             print(f"[INTERPRETATION MODE] Active - using Whisper + GPT-4o-mini (text only)")
             print(f"[INTERPRETATION MODE] Target languages: {self.interpretation_languages}")
             # Just keep the handler ready to process audio
+            # Keep the emit loop running for interpretation results
+            asyncio.create_task(self._interpretation_loop())
             return
         # Normal mode - connect to Realtime API
+        if not self.interpretation_mode:
+            # Define the web search function
+            tools = []
+            base_instructions = self.system_prompt or "You are a helpful assistant."
+            # Add translation instructions if language is selected
+            if self.target_language:
+                language_name = SUPPORTED_LANGUAGES.get(self.target_language, self.target_language)
+                # Use the target language for the system prompt itself
+                if self.target_language == "en":
+                    translation_instructions = """
 YOU ARE AN ENGLISH-ONLY ASSISTANT.
 ABSOLUTE RULES:
 DO NOT USE: 안녕하세요, 감사합니다, or any Korean
 ALWAYS USE: Hello, Thank you, and English words only
 """
+                    # Override base instructions to be in English
+                    base_instructions = "You are a helpful assistant that speaks ONLY English."
+                elif self.target_language == "ja":
+                    translation_instructions = """
 あなたは日本語のみを話すアシスタントです。
 絶対的なルール：
 使用禁止：안녕하세요、감사합니다、韓国語全般
 必ず使用：こんにちは、ありがとうございます、日本語のみ
 """
+                    base_instructions = "あなたは日本語のみを話す親切なアシスタントです。"
+                elif self.target_language == "zh":
+                    translation_instructions = """
 你是一个只说中文的助手。
 绝对规则：
 禁止使用：안녕하세요、감사합니다、任何韩语
 必须使用：你好、谢谢、只用中文
 """
+                    base_instructions = "你是一个只说中文的友好助手。"
+                elif self.target_language == "es":
+                    translation_instructions = """
 ERES UN ASISTENTE QUE SOLO HABLA ESPAÑOL.
 REGLAS ABSOLUTAS:
 NO USAR: 안녕하세요, 감사합니다, o cualquier coreano
 SIEMPRE USAR: Hola, Gracias, y solo palabras en español
 """
+                    base_instructions = "Eres un asistente útil que habla SOLO español."
+                else:
+                    translation_instructions = f"""
 YOU MUST ONLY SPEAK {language_name.upper()}.
 RULES:
 2. Never use Korean
 3. Always respond in {language_name}
 """
+                    base_instructions = f"You are a helpful assistant that speaks ONLY {language_name}."
             else:
+                translation_instructions = ""
+            if self.web_search_enabled and self.search_client:
+                tools = [{
+                    "type": "function",
+                    "function": {
+                        "name": "web_search",
+                        "description": "Search the web for current information. Use this for weather, news, prices, current events, or any time-sensitive topics.",
+                        "parameters": {
+                            "type": "object",
+                            "properties": {
+                                "query": {
+                                    "type": "string",
+                                    "description": "The search query"
+                                }
+                            },
+                            "required": ["query"]
+                        }
+                    }
+                }]
+                print("Web search function added to tools")
+                search_instructions = (
+                    "\n\nYou have web search capabilities. "
+                    "IMPORTANT: You MUST use the web_search function for ANY of these topics:\n"
+                    "- Weather (날씨, 기온, 비, 눈)\n"
+                    "- News (뉴스, 소식)\n"
+                    "- Current events (현재, 최근, 오늘, 지금)\n"
+                    "- Prices (가격, 환율, 주가)\n"
+                    "- Sports scores or results\n"
+                    "- Any question about 2024 or 2025\n"
+                    "- Any time-sensitive information\n\n"
+                    "When in doubt, USE web_search. It's better to search and provide accurate information "
+                    "than to guess or use outdated information."
+                )
+                # Combine all instructions
+                if translation_instructions:
+                    # Translation instructions already include base_instructions
+                    instructions = translation_instructions + search_instructions
+                else:
+                    instructions = base_instructions + search_instructions
+            else:
+                # No web search
+                if translation_instructions:
+                    instructions = translation_instructions
+                else:
+                    instructions = base_instructions
+            print(f"[NORMAL MODE] Base instructions: {base_instructions[:100]}...")
+            print(f"[NORMAL MODE] Translation instructions: {translation_instructions[:200] if translation_instructions else 'None'}...")
+            print(f"[NORMAL MODE] Combined instructions length: {len(instructions)}")
+            print(f"[NORMAL MODE] Target language: {self.target_language}")
+            async with self.client.beta.realtime.connect(
+                model="gpt-4o-mini-realtime-preview-2024-12-17"
+            ) as conn:
+                # Update session with tools
+                session_update = {
+                    "turn_detection": {"type": "server_vad"},
+                    "instructions": instructions,
+                    "tools": tools,
+                    "tool_choice": "auto" if tools else "none",
+                    "temperature": 0.7,
+                    "max_response_output_tokens": 4096,
+                    "modalities": ["text", "audio"],
+                    "voice": "alloy"  # Default voice
+                }
+                # Use appropriate voice for the language
+                if self.target_language:
+                    # Force language through multiple mechanisms
+                    # 1. Use voice that's known to work well with the language
+                    voice_map = {
+                        "en": "nova",      # Nova has clearer English
+                        "es": "nova",      # Nova works for Spanish
+                        "fr": "shimmer",   # Shimmer for French
+                        "de": "echo",      # Echo for German
+                        "ja": "alloy",     # Alloy can do Japanese
+                        "zh": "alloy",     # Alloy can do Chinese
+                        "ko": "nova",      # Nova for Korean
                     }
+                    session_update["voice"] = voice_map.get(self.target_language, "nova")
+                    # 2. Add language to modalities (experimental)
+                    session_update["modalities"] = ["text", "audio"]
+                    # 3. Set output format
+                    session_update["output_audio_format"] = "pcm16"
+                    # 4. Add language hint to the system (if supported by API)
+                    if self.target_language in ["en", "es", "fr", "de", "ja", "zh"]:
+                        session_update["language"] = self.target_language  # Try setting language directly
+                    print(f"[TRANSLATION MODE] Session update: {json.dumps(session_update, indent=2)}")
+                await conn.session.update(session=session_update)
+                self.connection = conn
+                print(f"Connected with tools: {len(tools)} functions, voice: {session_update.get('voice', 'default')}")
+                async for event in self.connection:
+                    # Debug logging for function calls
+                    if event.type.startswith("response.function_call"):
+                        print(f"Function event: {event.type}")
+                    if event.type == "response.audio_transcript.done":
+                        print(f"[RESPONSE] Transcript: {event.transcript[:100]}...")
+                        print(f"[RESPONSE] Expected language: {self.target_language}")
+                        output_data = {
+                            "event": event,
+                            "language": SUPPORTED_LANGUAGES.get(self.target_language, "") if self.target_language else ""
+                        }
+                        await self.output_queue.put(AdditionalOutputs(output_data))
+                    elif event.type == "response.audio.delta":
+                        await self.output_queue.put(
+                            (
+                                self.output_sample_rate,
+                                np.frombuffer(
+                                    base64.b64decode(event.delta), dtype=np.int16
+                                ).reshape(1, -1),
+                            ),
+                        )
+                    # Handle function calls (only in non-interpretation mode)
+                    elif event.type == "response.function_call_arguments.start" and not self.interpretation_mode:
+                        print(f"Function call started")
+                        self.function_call_in_progress = True
+                        self.current_function_args = ""
+                        self.current_call_id = getattr(event, 'call_id', None)
+                    elif event.type == "response.function_call_arguments.delta" and not self.interpretation_mode:
+                        if self.function_call_in_progress:
+                            self.current_function_args += event.delta
+                    elif event.type == "response.function_call_arguments.done" and not self.interpretation_mode:
+                        if self.function_call_in_progress:
+                            print(f"Function call done, args: {self.current_function_args}")
+                            try:
+                                args = json.loads(self.current_function_args)
+                                query = args.get("query", "")
+                                # Emit search event to client
+                                await self.output_queue.put(AdditionalOutputs({
+                                    "type": "search",
+                                    "query": query
+                                }))
+                                # Perform the search
+                                search_results = await self.search_web(query)
+                                print(f"Search results length: {len(search_results)}")
+                                # Send function result back to the model
+                                if self.connection and self.current_call_id:
+                                    await self.connection.conversation.item.create(
+                                        item={
+                                            "type": "function_call_output",
+                                            "call_id": self.current_call_id,
+                                            "output": search_results
+                                        }
+                                    )
+                                    await self.connection.response.create()
+                            except Exception as e:
+                                print(f"Function call error: {e}")
+                            finally:
+                                self.function_call_in_progress = False
+                                self.current_function_args = ""
+                                self.current_call_id = None
     async def receive(self, frame: tuple[int, np.ndarray]) -> None:
         if self.interpretation_mode: