Update app.py

app.py CHANGED
@@ -861,8 +861,8 @@ HTML_CONTENT = """<!DOCTYPE html>
            console.log('Selected language:', selectedLanguage);
        });

-        // Interpretation mode toggle
-        interpretationToggle.addEventListener('click', () => {
+        // Interpretation mode toggle - revised version
+        interpretationToggle.addEventListener('click', async () => {
            if (!interpretationMode) {
                // Turning ON interpretation mode
                interpretationLanguagesContainer.style.display = 'block';
@@ -891,15 +891,22 @@ HTML_CONTENT = """<!DOCTYPE html>
                textInput.placeholder = '통역 모드에서는 텍스트 입력이 지원되지 않습니다';
                sendButton.style.display = 'none';

-                console.log('Interpretation mode enabled with languages:', interpretationLanguages);
+                console.log('[FRONTEND] Interpretation mode enabled with languages:', interpretationLanguages);

-                // If connected, restart
+                // If connected, stop and restart with interpretation mode
                if (peerConnection && peerConnection.connectionState === 'connected') {
+                    console.log('[FRONTEND] Stopping current connection for interpretation mode');
                    showError('통역 모드 설정을 적용하기 위해 연결을 다시 시작합니다.');
+
+                    // Force stop the connection
                    stop();
-
-
-
+
+                    // Wait a bit longer to ensure cleanup
+                    await new Promise(resolve => setTimeout(resolve, 1000));
+
+                    // Start new connection with interpretation mode
+                    console.log('[FRONTEND] Starting new connection with interpretation mode');
+                    setupWebRTC();
                }
            } else {
                // Turning OFF interpretation mode
@@ -916,19 +923,26 @@ HTML_CONTENT = """<!DOCTYPE html>
                textInput.placeholder = '텍스트 메시지를 입력하세요...';
                sendButton.style.display = 'block';

-                console.log('Interpretation mode disabled');
+                console.log('[FRONTEND] Interpretation mode disabled');

                // If connected, restart to apply normal mode
                if (peerConnection && peerConnection.connectionState === 'connected') {
+                    console.log('[FRONTEND] Stopping current connection for normal mode');
                    showError('일반 모드로 전환하기 위해 연결을 다시 시작합니다.');
+
+                    // Force stop the connection
                    stop();
-
-
-
+
+                    // Wait a bit longer to ensure cleanup
+                    await new Promise(resolve => setTimeout(resolve, 1000));
+
+                    // Start new connection with normal mode
+                    console.log('[FRONTEND] Starting new connection with normal mode');
+                    setupWebRTC();
                }
            }

-            console.log('
+            console.log('[FRONTEND] Final interpretation mode:', interpretationMode);
        });

        // System prompt update
@@ -1284,16 +1298,33 @@ HTML_CONTENT = """<!DOCTYPE html>
        }

        function stop() {
+            console.log('[STOP] Stopping connection...');
+
+            // Cancel animation frame first
            if (animationFrame) {
                cancelAnimationFrame(animationFrame);
+                animationFrame = null;
            }
+
+            // Close audio context
            if (audioContext) {
                audioContext.close();
                audioContext = null;
                analyser = null;
                audioSource = null;
            }
+
+            // Close data channel
+            if (dataChannel) {
+                dataChannel.close();
+                dataChannel = null;
+            }
+
+            // Close peer connection
            if (peerConnection) {
+                console.log('[STOP] Current connection state:', peerConnection.connectionState);
+
+                // Stop all transceivers
                if (peerConnection.getTransceivers) {
                    peerConnection.getTransceivers().forEach(transceiver => {
                        if (transceiver.stop) {
@@ -1301,17 +1332,46 @@ HTML_CONTENT = """<!DOCTYPE html>
                        }
                    });
                }
+
+                // Stop all senders
                if (peerConnection.getSenders) {
                    peerConnection.getSenders().forEach(sender => {
-                        if (sender.track
+                        if (sender.track) {
+                            sender.track.stop();
+                        }
+                    });
+                }
+
+                // Stop all receivers
+                if (peerConnection.getReceivers) {
+                    peerConnection.getReceivers().forEach(receiver => {
+                        if (receiver.track) {
+                            receiver.track.stop();
+                        }
                    });
                }
-
+
+                // Close the connection
                peerConnection.close();
+
+                // Clear the reference
+                peerConnection = null;
+
+                console.log('[STOP] Connection closed');
            }
-
-
+
+            // Reset audio level
            audioLevel = 0;
+            isVoiceActive = false;
+
+            // Update UI
+            updateButtonState();
+
+            // Clear any existing webrtc_id
+            if (webrtc_id) {
+                console.log('[STOP] Clearing webrtc_id:', webrtc_id);
+                webrtc_id = null;
+            }
        }
        startButton.addEventListener('click', () => {
            console.log('clicked');
@@ -1534,9 +1594,8 @@ class OpenAIHandler(AsyncStreamHandler):
        self.silence_threshold = 20  # Reduced for faster response (20 frames = ~0.4 seconds)
        self.min_audio_length = 10  # Minimum frames to consider as speech

-        print(f"Handler created with
-        f"
-              f"interpretation_mode={interpretation_mode}, interpretation_languages={interpretation_languages}")
+        print(f"[INIT] Handler created with interpretation_mode={interpretation_mode}, "
+              f"interpretation_languages={interpretation_languages}")

    def copy(self):
        # Get the most recent settings
@@ -1548,6 +1607,12 @@ class OpenAIHandler(AsyncStreamHandler):
        if recent_ids:
            recent_id = recent_ids[0]
            settings = connection_settings[recent_id]
+
+            # Log the settings being copied
+            print(f"[COPY] Copying settings from {recent_id}:")
+            print(f"[COPY] interpretation_mode={settings.get('interpretation_mode', False)}")
+            print(f"[COPY] interpretation_languages={settings.get('interpretation_languages', [])}")
+
            return OpenAIHandler(
                web_search_enabled=settings.get('web_search_enabled', False),
                target_language=settings.get('target_language', ''),
@@ -1557,7 +1622,7 @@ class OpenAIHandler(AsyncStreamHandler):
                interpretation_languages=settings.get('interpretation_languages', [])
            )

-        print(f"
+        print(f"[COPY] No settings found, creating default handler")
        return OpenAIHandler(web_search_enabled=False, interpretation_mode=False)

    async def search_web(self, query: str) -> str:
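Note: connection_settings itself is declared outside the hunks in this commit. A minimal sketch of the assumed shape that copy() reads from; the helper name most_recent_settings is hypothetical, copy() inlines the same timestamp-based lookup:

# Assumed shape only; the actual declaration is not part of this diff.
connection_settings: dict[str, dict] = {}  # webrtc_id -> settings dict including a 'timestamp' key

def most_recent_settings() -> dict | None:
    """Mirror of the lookup in copy(): the entry stored most recently wins."""
    if not connection_settings:
        return None
    recent_ids = sorted(connection_settings,
                        key=lambda k: connection_settings[k]['timestamp'],
                        reverse=True)
    return connection_settings[recent_ids[0]]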
@@ -1772,52 +1837,47 @@ Direct translation to {target_lang_name}:"""
    async def start_up(self):
        """Connect to realtime API or setup interpretation mode"""
        # First check if we have the most recent settings
-        if connection_settings:
-
-
-                reverse=True)
-            if recent_ids:
-                recent_id = recent_ids[0]
-                settings = connection_settings[recent_id]
+        if connection_settings and self.webrtc_id:
+            if self.webrtc_id in connection_settings:
+                settings = connection_settings[self.webrtc_id]
                self.web_search_enabled = settings.get('web_search_enabled', False)
                self.target_language = settings.get('target_language', '')
                self.system_prompt = settings.get('system_prompt', '')
                self.interpretation_mode = settings.get('interpretation_mode', False)
                self.interpretation_languages = settings.get('interpretation_languages', [])
-
-        print(f"
-
-
-        print(f"Handler interpretation settings: mode={self.interpretation_mode}, languages={self.interpretation_languages}")
-
-        print(f"Starting
-              f"target_language={self.target_language}, interpretation_mode={self.interpretation_mode}, "
-              f"interpretation_languages={self.interpretation_languages}")
+
+                print(f"[START_UP] Updated settings from storage for {self.webrtc_id}")
+                print(f"[START_UP] interpretation_mode={self.interpretation_mode}")
+                print(f"[START_UP] interpretation_languages={self.interpretation_languages}")

+        print(f"[START_UP] Starting with interpretation_mode={self.interpretation_mode}")

        self.client = openai.AsyncOpenAI()

        # If in interpretation mode, don't connect to Realtime API
        if self.interpretation_mode:
-            print(f"[INTERPRETATION MODE] Active -
+            print(f"[INTERPRETATION MODE] Active - Skipping Realtime API connection")
+            print(f"[INTERPRETATION MODE] Using Whisper + GPT-4o-mini (text only)")
            print(f"[INTERPRETATION MODE] Target languages: {self.interpretation_languages}")
+
+            # Do NOT connect to Realtime API
            # Just keep the handler ready to process audio
-            # Keep the emit loop running for interpretation results
-            asyncio.create_task(self._interpretation_loop())
            return

        # Normal mode - connect to Realtime API
-
-
-
-
-        #
-        if self.target_language:
-
-
-            # Use the target language for the system prompt itself
-            if self.target_language == "en":
-                translation_instructions = """
+        print(f"[NORMAL MODE] Connecting to Realtime API...")
+
+        # Define the web search function
+        tools = []
+        base_instructions = self.system_prompt or "You are a helpful assistant."
+
+        # Add translation instructions if language is selected
+        if self.target_language:
+            language_name = SUPPORTED_LANGUAGES.get(self.target_language, self.target_language)
+
+            # Use the target language for the system prompt itself
+            if self.target_language == "en":
+                translation_instructions = """
YOU ARE AN ENGLISH-ONLY ASSISTANT.

ABSOLUTE RULES:
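Note: the Whisper + GPT-4o-mini pipeline that interpretation mode hands buffered audio to (the removed _interpretation_loop call above) is not part of this commit. A minimal sketch of one plausible shape, assuming buffered PCM16 numpy chunks and the openai Python SDK; the function name interpret_chunk and the 24 kHz sample rate are assumptions:

import io
import wave
import numpy as np
import openai

async def interpret_chunk(client: openai.AsyncOpenAI,
                          audio_chunks: list[np.ndarray],
                          target_language_name: str,
                          sample_rate: int = 24000) -> str:
    # Pack the buffered PCM16 samples into an in-memory WAV file for Whisper.
    pcm = np.concatenate(audio_chunks).astype(np.int16)
    buf = io.BytesIO()
    with wave.open(buf, "wb") as wav:
        wav.setnchannels(1)
        wav.setsampwidth(2)        # 16-bit samples
        wav.setframerate(sample_rate)
        wav.writeframes(pcm.tobytes())
    buf.seek(0)

    # Speech-to-text with Whisper, then text-only translation with gpt-4o-mini.
    transcript = await client.audio.transcriptions.create(
        model="whisper-1",
        file=("speech.wav", buf.read()),
    )
    completion = await client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system",
             "content": f"Translate the user's words into {target_language_name}. Reply with the translation only."},
            {"role": "user", "content": transcript.text},
        ],
    )
    return completion.choices[0].message.content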
@@ -1831,11 +1891,11 @@ YOUR LANGUAGE MODE: ENGLISH ONLY
DO NOT USE: 안녕하세요, 감사합니다, or any Korean
ALWAYS USE: Hello, Thank you, and English words only
"""
-
-
-
-
-
+                # Override base instructions to be in English
+                base_instructions = "You are a helpful assistant that speaks ONLY English."
+
+            elif self.target_language == "ja":
+                translation_instructions = """
あなたは日本語のみを話すアシスタントです。

絶対的なルール:
@@ -1849,10 +1909,10 @@ ALWAYS USE: Hello, Thank you, and English words only
使用禁止:안녕하세요、감사합니다、韓国語全般
必ず使用:こんにちは、ありがとうございます、日本語のみ
"""
-
-
-
-
+                base_instructions = "あなたは日本語のみを話す親切なアシスタントです。"
+
+            elif self.target_language == "zh":
+                translation_instructions = """
你是一个只说中文的助手。

绝对规则:
@@ -1866,10 +1926,10 @@ ALWAYS USE: Hello, Thank you, and English words only
禁止使用:안녕하세요、감사합니다、任何韩语
必须使用:你好、谢谢、只用中文
"""
-
-
-
-
+                base_instructions = "你是一个只说中文的友好助手。"
+
+            elif self.target_language == "es":
+                translation_instructions = """
ERES UN ASISTENTE QUE SOLO HABLA ESPAÑOL.

REGLAS ABSOLUTAS:
@@ -1883,9 +1943,9 @@ MODO DE IDIOMA: SOLO ESPAÑOL
NO USAR: 안녕하세요, 감사합니다, o cualquier coreano
SIEMPRE USAR: Hola, Gracias, y solo palabras en español
"""
-
-
-
+                base_instructions = "Eres un asistente útil que habla SOLO español."
+            else:
+                translation_instructions = f"""
YOU MUST ONLY SPEAK {language_name.upper()}.

RULES:
@@ -1893,181 +1953,182 @@ RULES:
2. Never use Korean
3. Always respond in {language_name}
"""
-
+                base_instructions = f"You are a helpful assistant that speaks ONLY {language_name}."
+        else:
+            translation_instructions = ""
+
+        if self.web_search_enabled and self.search_client:
+            tools = [{
+                "type": "function",
+                "function": {
+                    "name": "web_search",
+                    "description": "Search the web for current information. Use this for weather, news, prices, current events, or any time-sensitive topics.",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "query": {
+                                "type": "string",
+                                "description": "The search query"
+                            }
+                        },
+                        "required": ["query"]
+                    }
+                }
+            }]
+            print("Web search function added to tools")
+
+            search_instructions = (
+                "\n\nYou have web search capabilities. "
+                "IMPORTANT: You MUST use the web_search function for ANY of these topics:\n"
+                "- Weather (날씨, 기온, 비, 눈)\n"
+                "- News (뉴스, 소식)\n"
+                "- Current events (현재, 최근, 오늘, 지금)\n"
+                "- Prices (가격, 환율, 주가)\n"
+                "- Sports scores or results\n"
+                "- Any question about 2024 or 2025\n"
+                "- Any time-sensitive information\n\n"
+                "When in doubt, USE web_search. It's better to search and provide accurate information "
+                "than to guess or use outdated information."
+            )
+
+            # Combine all instructions
+            if translation_instructions:
+                # Translation instructions already include base_instructions
+                instructions = translation_instructions + search_instructions
            else:
-

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                        "required": ["query"]
-                    }
-                }
-            }]
-            print("Web search function added to tools")

-
-
-                "IMPORTANT: You MUST use the web_search function for ANY of these topics:\n"
-                "- Weather (날씨, 기온, 비, 눈)\n"
-                "- News (뉴스, 소식)\n"
-                "- Current events (현재, 최근, 오늘, 지금)\n"
-                "- Prices (가격, 환율, 주가)\n"
-                "- Sports scores or results\n"
-                "- Any question about 2024 or 2025\n"
-                "- Any time-sensitive information\n\n"
-                "When in doubt, USE web_search. It's better to search and provide accurate information "
-                "than to guess or use outdated information."
-            )

-            #
-
-
-
-
-
-
-
-            if translation_instructions:
-                instructions = translation_instructions
-            else:
-                instructions = base_instructions

-
-
-        print(f"
-        print(f"[NORMAL MODE] Target language: {self.target_language}")

-        async
-
-
-
-            session_update = {
-                "turn_detection": {"type": "server_vad"},
-                "instructions": instructions,
-                "tools": tools,
-                "tool_choice": "auto" if tools else "none",
-                "temperature": 0.7,
-                "max_response_output_tokens": 4096,
-                "modalities": ["text", "audio"],
-                "voice": "alloy"  # Default voice
-            }

-
-
-
-                # 1. Use voice that's known to work well with the language
-                voice_map = {
-                    "en": "nova",  # Nova has clearer English
-                    "es": "nova",  # Nova works for Spanish
-                    "fr": "shimmer",  # Shimmer for French
-                    "de": "echo",  # Echo for German
-                    "ja": "alloy",  # Alloy can do Japanese
-                    "zh": "alloy",  # Alloy can do Chinese
-                    "ko": "nova",  # Nova for Korean
-                }
-                session_update["voice"] = voice_map.get(self.target_language, "nova")
-
-                # 2. Add language to modalities (experimental)
-                session_update["modalities"] = ["text", "audio"]
-
-                # 3. Set output format
-                session_update["output_audio_format"] = "pcm16"

-
-
-
-
-

-
-
-

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            )
-            )
-
-            # Handle function calls (only in non-interpretation mode)
-            elif event.type == "response.function_call_arguments.start" and not self.interpretation_mode:
-                print(f"Function call started")
-                self.function_call_in_progress = True
-                self.current_function_args = ""
-                self.current_call_id = getattr(event, 'call_id', None)
-
-            elif event.type == "response.function_call_arguments.delta" and not self.interpretation_mode:
-                if self.function_call_in_progress:
-                    self.current_function_args += event.delta
-
-            elif event.type == "response.function_call_arguments.done" and not self.interpretation_mode:
-                if self.function_call_in_progress:
-                    print(f"Function call done, args: {self.current_function_args}")
-                    try:
-                        args = json.loads(self.current_function_args)
-                        query = args.get("query", "")
-
-                        # Emit search event to client
-                        await self.output_queue.put(AdditionalOutputs({
-                            "type": "search",
-                            "query": query
-                        }))
-
-                        # Perform the search
-                        search_results = await self.search_web(query)
-                        print(f"Search results length: {len(search_results)}")
-
-                        # Send function result back to the model
-                        if self.connection and self.current_call_id:
-                            await self.connection.conversation.item.create(
-                                item={
-                                    "type": "function_call_output",
-                                    "call_id": self.current_call_id,
-                                    "output": search_results
-                                }
-                            )
-                            await self.connection.response.create()

-
-
-
-
-
-
+                instructions = base_instructions + search_instructions
+        else:
+            # No web search
+            if translation_instructions:
+                instructions = translation_instructions
+            else:
+                instructions = base_instructions
+
+        print(f"[NORMAL MODE] Base instructions: {base_instructions[:100]}...")
+        print(f"[NORMAL MODE] Translation instructions: {translation_instructions[:200] if translation_instructions else 'None'}...")
+        print(f"[NORMAL MODE] Combined instructions length: {len(instructions)}")
+        print(f"[NORMAL MODE] Target language: {self.target_language}")
+
+        async with self.client.beta.realtime.connect(
+            model="gpt-4o-mini-realtime-preview-2024-12-17"
+        ) as conn:
+            # Update session with tools
+            session_update = {
+                "turn_detection": {"type": "server_vad"},
+                "instructions": instructions,
+                "tools": tools,
+                "tool_choice": "auto" if tools else "none",
+                "temperature": 0.7,
+                "max_response_output_tokens": 4096,
+                "modalities": ["text", "audio"],
+                "voice": "alloy"  # Default voice
+            }
+
+            # Use appropriate voice for the language
+            if self.target_language:
+                # Force language through multiple mechanisms
+                # 1. Use voice that's known to work well with the language
+                voice_map = {
+                    "en": "nova",  # Nova has clearer English
+                    "es": "nova",  # Nova works for Spanish
+                    "fr": "shimmer",  # Shimmer for French
+                    "de": "echo",  # Echo for German
+                    "ja": "alloy",  # Alloy can do Japanese
+                    "zh": "alloy",  # Alloy can do Chinese
+                    "ko": "nova",  # Nova for Korean
+                }
+                session_update["voice"] = voice_map.get(self.target_language, "nova")
+
+                # 2. Add language to modalities (experimental)
+                session_update["modalities"] = ["text", "audio"]
+
+                # 3. Set output format
+                session_update["output_audio_format"] = "pcm16"
+
+                # 4. Add language hint to the system (if supported by API)
+                if self.target_language in ["en", "es", "fr", "de", "ja", "zh"]:
+                    session_update["language"] = self.target_language  # Try setting language directly
+
+                print(f"[TRANSLATION MODE] Session update: {json.dumps(session_update, indent=2)}")
+
+            await conn.session.update(session=session_update)
+            self.connection = conn
+            print(f"Connected with tools: {len(tools)} functions, voice: {session_update.get('voice', 'default')}")
+
+            async for event in self.connection:
+                # Debug logging for function calls
+                if event.type.startswith("response.function_call"):
+                    print(f"Function event: {event.type}")
+
+                if event.type == "response.audio_transcript.done":
+                    print(f"[RESPONSE] Transcript: {event.transcript[:100]}...")
+                    print(f"[RESPONSE] Expected language: {self.target_language}")
+
+                    output_data = {
+                        "event": event,
+                        "language": SUPPORTED_LANGUAGES.get(self.target_language, "") if self.target_language else ""
+                    }
+                    await self.output_queue.put(AdditionalOutputs(output_data))
+
+                elif event.type == "response.audio.delta":
+                    await self.output_queue.put(
+                        (
+                            self.output_sample_rate,
+                            np.frombuffer(
+                                base64.b64decode(event.delta), dtype=np.int16
+                            ).reshape(1, -1),
+                        ),
+                    )
+
+                # Handle function calls (only in non-interpretation mode)
+                elif event.type == "response.function_call_arguments.start" and not self.interpretation_mode:
+                    print(f"Function call started")
+                    self.function_call_in_progress = True
+                    self.current_function_args = ""
+                    self.current_call_id = getattr(event, 'call_id', None)
+
+                elif event.type == "response.function_call_arguments.delta" and not self.interpretation_mode:
+                    if self.function_call_in_progress:
+                        self.current_function_args += event.delta
+
+                elif event.type == "response.function_call_arguments.done" and not self.interpretation_mode:
+                    if self.function_call_in_progress:
+                        print(f"Function call done, args: {self.current_function_args}")
+                        try:
+                            args = json.loads(self.current_function_args)
+                            query = args.get("query", "")
+
+                            # Emit search event to client
+                            await self.output_queue.put(AdditionalOutputs({
+                                "type": "search",
+                                "query": query
+                            }))
+
+                            # Perform the search
+                            search_results = await self.search_web(query)
+                            print(f"Search results length: {len(search_results)}")
+
+                            # Send function result back to the model
+                            if self.connection and self.current_call_id:
+                                await self.connection.conversation.item.create(
+                                    item={
+                                        "type": "function_call_output",
+                                        "call_id": self.current_call_id,
+                                        "output": search_results
+                                    }
+                                )
+                                await self.connection.response.create()
+
+                        except Exception as e:
+                            print(f"Function call error: {e}")
+                        finally:
+                            self.function_call_in_progress = False
+                            self.current_function_args = ""
+                            self.current_call_id = None

    async def receive(self, frame: tuple[int, np.ndarray]) -> None:
        if self.interpretation_mode:
+            print(f"[RECEIVE] Processing in interpretation mode")
            # In interpretation mode, buffer audio and process with Whisper
            _, array = frame
            array = array.squeeze()
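Note: SUPPORTED_LANGUAGES is referenced in this hunk (and in start_up above) but defined outside the lines shown in this commit. As an assumption for readability, it maps ISO language codes to display names along these lines, consistent with the voice_map keys:

# Assumed shape only; the real mapping lives elsewhere in app.py.
SUPPORTED_LANGUAGES = {
    "en": "English",
    "es": "Spanish",
    "fr": "French",
    "de": "German",
    "ja": "Japanese",
    "zh": "Chinese",
    "ko": "Korean",
}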
@@ -2093,6 +2154,7 @@ RULES:
        else:
            # Normal mode - use Realtime API
            if not self.connection:
+                print(f"[RECEIVE] No connection in normal mode, skipping")
                return
            try:
                _, array = frame
@@ -2101,16 +2163,16 @@ RULES:
                await self.connection.input_audio_buffer.append(audio=audio_message)
            except Exception as e:
                print(f"Error in receive: {e}")
-                # Connection might be closed, ignore the error

    async def emit(self) -> tuple[int, np.ndarray] | AdditionalOutputs | None:
-        # In interpretation mode, we need to keep checking for
+        # In interpretation mode, we need to keep checking for outputs
        if self.interpretation_mode:
-            # Use a timeout to prevent blocking
+            # Use a short timeout to prevent blocking
            try:
-                item = await asyncio.wait_for(wait_for_item(self.output_queue), timeout=0.
+                item = await asyncio.wait_for(wait_for_item(self.output_queue), timeout=0.05)
                return item
            except asyncio.TimeoutError:
+                # Return None to keep the stream alive
                return None
        else:
            # Normal mode
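Note: the construction of audio_message is outside the lines shown above. A minimal sketch of the usual encoding for the Realtime API input buffer, mirroring the np.frombuffer(base64.b64decode(...), dtype=np.int16) decode used in the response handler; the helper name is hypothetical:

import base64
import numpy as np

def encode_audio_frame(array: np.ndarray) -> str:
    # PCM16 samples -> raw bytes -> base64 string, the inverse of the decode in the audio.delta handler.
    return base64.b64encode(array.astype(np.int16).tobytes()).decode("utf-8")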
@@ -2124,6 +2186,8 @@ RULES:
            return item

    async def shutdown(self) -> None:
+        print(f"[SHUTDOWN] Called with interpretation_mode={self.interpretation_mode}")
+
        if self.interpretation_mode:
            # Clean up interpretation mode
            self.audio_buffer = []
@@ -2134,6 +2198,7 @@ RULES:
            if self.connection:
                await self.connection.close()
                self.connection = None
+                print("[NORMAL MODE] Connection closed")


# Create initial handler instance
@@ -2173,9 +2238,11 @@ async def custom_offer(request: Request):
    interpretation_mode = body.get("interpretation_mode", False)
    interpretation_languages = body.get("interpretation_languages", [])

-    print(f"
-
-
+    print(f"[OFFER] Received offer with webrtc_id: {webrtc_id}")
+    print(f"[OFFER] interpretation_mode: {interpretation_mode}")
+    print(f"[OFFER] interpretation_languages: {interpretation_languages}")
+    print(f"[OFFER] web_search_enabled: {web_search_enabled}")
+    print(f"[OFFER] target_language: {target_language}")

    # Store settings with timestamp
    if webrtc_id:
@@ -2187,6 +2254,9 @@ async def custom_offer(request: Request):
            'interpretation_languages': interpretation_languages,
            'timestamp': asyncio.get_event_loop().time()
        }
+
+        print(f"[OFFER] Stored settings for {webrtc_id}:")
+        print(f"[OFFER] {connection_settings[webrtc_id]}")

    # Remove our custom route temporarily
    custom_route = None
@@ -2196,12 +2266,15 @@ async def custom_offer(request: Request):
            break

    # Forward to stream's offer handler
+    print(f"[OFFER] Forwarding to stream.offer()")
    response = await stream.offer(body)

    # Re-add our custom route
    if custom_route:
        app.routes.insert(0, custom_route)

+    print(f"[OFFER] Response status: {response.get('status', 'unknown') if isinstance(response, dict) else 'OK'}")
+
    return response