Update app.py

app.py CHANGED
@@ -20,6 +20,9 @@ from openai.types.beta.realtime import ResponseAudioTranscriptDoneEvent
 import httpx
 from typing import Optional, List, Dict
 import gradio as gr
+import io
+from scipy import signal
+import wave
 
 load_dotenv()
 
@@ -315,6 +318,15 @@ HTML_CONTENT = """<!DOCTYPE html>
             padding: 10px;
             margin-bottom: 10px;
         }
+        .message.assistant.interpretation {
+            background: linear-gradient(135deg, #1a5a3e, #2e7d32);
+            font-style: italic;
+        }
+        .interpretation-arrow {
+            color: #4caf50;
+            font-weight: bold;
+            margin: 0 10px;
+        }
         .controls {
             text-align: center;
             margin-top: auto;
@@ -618,7 +630,10 @@ HTML_CONTENT = """<!DOCTYPE html>
             </div>
         </div>
         <div class="interpretation-info" id="interpretation-info" style="display: none;">
-
+            <strong>통역 모드 안내:</strong><br>
+            • 음성으로 말하면 선택한 언어로 자동 통역됩니다<br>
+            • Whisper + GPT-4o-mini + TTS를 사용합니다<br>
+            • 말을 마치고 잠시 기다리면 통역이 시작됩니다
         </div>
         <div class="text-input-section">
             <label for="system-prompt" class="setting-label">시스템 프롬프트:</label>
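The panel added in this hunk describes the interpretation flow end to end: speech is transcribed with Whisper, translated by GPT-4o-mini, and read back with TTS once the speaker pauses. For readers skimming the diff, here is a minimal, self-contained sketch of that three-step flow; it is not the Space's handler code, and the `translate_speech` helper, the WAV path, and the English default target are illustrative assumptions.

```python
# Minimal sketch of the flow the panel describes: Whisper -> GPT-4o-mini -> TTS.
# Assumes OPENAI_API_KEY is set; translate_speech and its defaults are illustrative,
# not the Space's actual handler code.
import asyncio
import openai

async def translate_speech(wav_path: str, target_lang: str = "English") -> bytes:
    client = openai.AsyncOpenAI()

    # 1. Speech -> text (Whisper)
    with open(wav_path, "rb") as f:
        transcript = await client.audio.transcriptions.create(model="whisper-1", file=f)

    # 2. Text -> target language (GPT-4o-mini)
    chat = await client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system",
             "content": f"Translate the user's text to {target_lang}. Reply with the translation only."},
            {"role": "user", "content": transcript.text},
        ],
        temperature=0.3,
    )
    translated = chat.choices[0].message.content

    # 3. Text -> speech (TTS); collect the audio bytes the same way the handler in this diff does
    speech = await client.audio.speech.create(model="tts-1", voice="alloy", input=translated)
    audio = b""
    async for chunk in speech.iter_bytes(1024):
        audio += chunk
    return audio

# Example: asyncio.run(translate_speech("sample.wav", "English"))
```

The real handler below differs mainly in that it streams the TTS output back as fixed-size PCM chunks instead of returning one byte string.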
@@ -1049,6 +1064,16 @@ HTML_CONTENT = """<!DOCTYPE html>
                 if (selectedLanguage && eventJson.language) {
                     content += ` <span class="language-info">[${eventJson.language}]</span>`;
                 } else if (interpretationMode && eventJson.language) {
+                    // In interpretation mode, show the translation process
+                    if (content.includes('→')) {
+                        // Format: "Korean text → English text"
+                        const parts = content.split('→');
+                        if (parts.length === 2) {
+                            content = `<span style="color: #999;">${parts[0].trim()}</span>` +
+                                      `<span class="interpretation-arrow">→</span>` +
+                                      `<strong>${parts[1].trim()}</strong>`;
+                        }
+                    }
                     content += ` <span class="language-info">[통역: ${eventJson.language}]</span>`;
                 }
                 addMessage("assistant", content);
@@ -1069,6 +1094,12 @@ HTML_CONTENT = """<!DOCTYPE html>
             function addMessage(role, content) {
                 const messageDiv = document.createElement('div');
                 messageDiv.classList.add('message', role);
+
+                // Check if it's an interpretation message
+                if (interpretationMode && role === 'assistant' && content.includes('→')) {
+                    messageDiv.classList.add('interpretation');
+                }
+
                 if (content.includes('<span')) {
                     messageDiv.innerHTML = content;
                 } else {
@@ -1273,6 +1304,13 @@ class OpenAIHandler(AsyncStreamHandler):
         self.system_prompt = system_prompt
         self.interpretation_mode = interpretation_mode
         self.interpretation_language = interpretation_language
+
+        # For interpretation mode
+        self.audio_buffer = []
+        self.is_recording = False
+        self.silence_frames = 0
+        self.silence_threshold = 30  # Number of silent frames before stopping
+
         print(f"Handler created with web_search_enabled={web_search_enabled}, "
               f"target_language={target_language}, webrtc_id={webrtc_id}, "
               f"interpretation_mode={interpretation_mode}, interpretation_language={interpretation_language}")
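The new buffering state ends an utterance after `silence_threshold = 30` consecutive quiet frames, so the actual pause length depends on how many samples each frame carries and at what rate. A hedged back-of-the-envelope, assuming the 480-sample chunk size used elsewhere in this commit and a 24 kHz stream (both assumptions; the app's real `SAMPLE_RATE` constant is defined outside this diff):

```python
# Rough estimate only: FRAME_SAMPLES and SAMPLE_RATE are assumptions, not values
# confirmed by this diff (SAMPLE_RATE is defined elsewhere in app.py).
FRAME_SAMPLES = 480        # assumed from the chunk_size used when emitting TTS audio
SAMPLE_RATE = 24_000       # assumed stream rate
SILENCE_THRESHOLD = 30     # frames, as set in __init__

pause = SILENCE_THRESHOLD * FRAME_SAMPLES / SAMPLE_RATE
print(f"interpretation triggers after roughly {pause:.2f}s of silence")  # ~0.60s
```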
@@ -1297,7 +1335,7 @@ class OpenAIHandler(AsyncStreamHandler):
         )
 
         print(f"Handler.copy() called - creating new handler with default settings")
-        return OpenAIHandler(web_search_enabled=False)
+        return OpenAIHandler(web_search_enabled=False, interpretation_mode=False)
 
     async def search_web(self, query: str) -> str:
         """Perform web search and return formatted results"""
@@ -1332,31 +1370,122 @@ class OpenAIHandler(AsyncStreamHandler):
             )
             await self.connection.response.create()
 
-    def
-        """
-        if not self.
-            return
-
-        target_language_name = SUPPORTED_LANGUAGES.get(self.interpretation_language, self.interpretation_language)
-        target_code = self.interpretation_language
-
-
-        f"
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    async def process_interpretation(self):
+        """Process audio buffer for interpretation"""
+        if not self.audio_buffer or not self.interpretation_language:
+            return
+
+        try:
+            print(f"[INTERPRETATION] Processing audio buffer with {len(self.audio_buffer)} frames")
+
+            # Convert audio buffer to WAV format
+            audio_data = np.concatenate(self.audio_buffer)
+
+            # Create WAV file in memory
+            wav_buffer = io.BytesIO()
+            with wave.open(wav_buffer, 'wb') as wav_file:
+                wav_file.setnchannels(1)  # Mono
+                wav_file.setsampwidth(2)  # 16-bit
+                wav_file.setframerate(SAMPLE_RATE)
+                wav_file.writeframes(audio_data.tobytes())
+
+            wav_buffer.seek(0)
+            wav_buffer.name = "audio.wav"
+
+            # 1. Transcribe with Whisper
+            print("[INTERPRETATION] Transcribing with Whisper...")
+            transcript = await self.client.audio.transcriptions.create(
+                model="whisper-1",
+                file=wav_buffer,
+                language="ko"  # Assuming Korean input
+            )
+
+            user_text = transcript.text.strip()
+            print(f"[INTERPRETATION] Transcribed: {user_text}")
+
+            if not user_text:
+                return
+
+            # 2. Translate with GPT-4o-mini
+            target_lang_name = SUPPORTED_LANGUAGES.get(self.interpretation_language, self.interpretation_language)
+
+            print(f"[INTERPRETATION] Translating to {target_lang_name}...")
+            translation_response = await self.client.chat.completions.create(
+                model="gpt-4o-mini",
+                messages=[
+                    {
+                        "role": "system",
+                        "content": f"You are a translator. Translate the following text to {target_lang_name}. "
+                                   f"Provide only the translation, nothing else."
+                    },
+                    {
+                        "role": "user",
+                        "content": user_text
+                    }
+                ],
+                temperature=0.3,
+                max_tokens=200
+            )
+
+            translated_text = translation_response.choices[0].message.content.strip()
+            print(f"[INTERPRETATION] Translated: {translated_text}")
+
+            # 3. Generate speech with TTS
+            print("[INTERPRETATION] Generating speech...")
+            tts_response = await self.client.audio.speech.create(
+                model="tts-1",
+                voice="alloy",
+                input=translated_text,
+                response_format="pcm",  # PCM format for direct playback
+                speed=1.0
+            )
+
+            # Convert response to bytes
+            audio_bytes = b""
+            async for chunk in tts_response.iter_bytes(1024):
+                audio_bytes += chunk
+
+            # Convert PCM to numpy array
+            audio_array = np.frombuffer(audio_bytes, dtype=np.int16)
+
+            # Resample from 24kHz (TTS output) to our sample rate if needed
+            if len(audio_array) > 0:
+                # Split audio into chunks and send
+                chunk_size = 480  # Match our frame size
+                for i in range(0, len(audio_array), chunk_size):
+                    chunk = audio_array[i:i + chunk_size]
+                    if len(chunk) < chunk_size:
+                        # Pad the last chunk if necessary
+                        chunk = np.pad(chunk, (0, chunk_size - len(chunk)), 'constant')
+
+                    await self.output_queue.put((SAMPLE_RATE, chunk.reshape(1, -1)))
+
+            # Send transcript event
+            output_data = {
+                "event": type('Event', (), {
+                    'transcript': f"{user_text} → {translated_text}"
+                })(),
+                "language": target_lang_name,
+                "mode": "interpretation"
+            }
+            await self.output_queue.put(AdditionalOutputs(output_data))
+
+        except Exception as e:
+            print(f"[INTERPRETATION] Error: {e}")
+            # Send error message to client
+            error_data = {
+                "event": type('Event', (), {
+                    'transcript': f"통역 오류: {str(e)}"
+                })(),
+                "language": "",
+                "mode": "error"
+            }
+            await self.output_queue.put(AdditionalOutputs(error_data))
+        finally:
+            # Clear the audio buffer
+            self.audio_buffer = []
+            self.is_recording = False
+            self.silence_frames = 0
 
     def get_translation_instructions(self):
         """Get instructions for translation based on target language"""
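Two loose ends in this hunk are worth noting: `from scipy import signal` is imported but never called, and the comment before the chunking loop says the 24 kHz PCM returned by `tts-1` should be resampled to the handler's rate, yet no resampling happens. If the rates do differ, the missing step could look roughly like the sketch below; the `TTS_RATE` and `SAMPLE_RATE` values are assumptions, not constants confirmed by this diff.

```python
# Hedged sketch of the resampling step the comment alludes to; the diff imports
# scipy.signal but never calls it. Rates below are assumptions.
import numpy as np
from scipy import signal

TTS_RATE = 24_000      # PCM rate produced by tts-1
SAMPLE_RATE = 16_000   # replace with the app's real output rate

def resample_pcm(audio: np.ndarray, src: int = TTS_RATE, dst: int = SAMPLE_RATE) -> np.ndarray:
    """Polyphase-resample int16 PCM from src Hz to dst Hz."""
    if src == dst:
        return audio
    resampled = signal.resample_poly(audio.astype(np.float32), dst, src)
    return np.clip(resampled, -32768, 32767).astype(np.int16)
```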
@@ -1371,7 +1500,7 @@ class OpenAIHandler(AsyncStreamHandler):
         )
 
     async def start_up(self):
-        """Connect to realtime API
+        """Connect to realtime API or setup interpretation mode"""
         # First check if we have the most recent settings
         if connection_settings:
             recent_ids = sorted(connection_settings.keys(),
@@ -1394,147 +1523,91 @@ class OpenAIHandler(AsyncStreamHandler):
         print(f"Starting up handler with web_search_enabled={self.web_search_enabled}, "
               f"target_language={self.target_language}, interpretation_mode={self.interpretation_mode}, "
               f"interpretation_language={self.interpretation_language}")
+
         self.client = openai.AsyncOpenAI()
 
+        # If in interpretation mode, don't connect to Realtime API
+        if self.interpretation_mode:
+            print(f"[INTERPRETATION MODE] Active - using Whisper + GPT-4o-mini + TTS")
+            print(f"[INTERPRETATION MODE] Target language: {self.interpretation_language}")
+            # Just keep the handler ready to process audio
+            return
+
+        # Normal mode - connect to Realtime API
         # Define the web search function
         tools = []
         base_instructions = self.system_prompt or "You are a helpful assistant."
 
-            #
-
-
-
-
-
-
-
-
+        # Add translation instructions if language is selected
+        translation_instructions = self.get_translation_instructions()
+
+        if self.web_search_enabled and self.search_client:
+            tools = [{
+                "type": "function",
+                "function": {
+                    "name": "web_search",
+                    "description": "Search the web for current information. Use this for weather, news, prices, current events, or any time-sensitive topics.",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "query": {
+                                "type": "string",
+                                "description": "The search query"
+                            }
+                        },
+                        "required": ["query"]
+                    }
+                }
+            }]
+            print("Web search function added to tools")
+
+            search_instructions = (
+                "\n\nYou have web search capabilities. "
+                "IMPORTANT: You MUST use the web_search function for ANY of these topics:\n"
+                "- Weather (날씨, 기온, 비, 눈)\n"
+                "- News (뉴스, 소식)\n"
+                "- Current events (현재, 최근, 오늘, 지금)\n"
+                "- Prices (가격, 환율, 주가)\n"
+                "- Sports scores or results\n"
+                "- Any question about 2024 or 2025\n"
+                "- Any time-sensitive information\n\n"
+                "When in doubt, USE web_search. It's better to search and provide accurate information "
+                "than to guess or use outdated information."
             )
-
-            tools = []
-            print(f"Interpretation mode active - target language: {self.interpretation_language}")
-            print(f"Instructions: {instructions}")
+            instructions = base_instructions + search_instructions + translation_instructions
         else:
-
-            translation_instructions = self.get_translation_instructions()
-
-            if self.web_search_enabled and self.search_client:
-                tools = [{
-                    "type": "function",
-                    "function": {
-                        "name": "web_search",
-                        "description": "Search the web for current information. Use this for weather, news, prices, current events, or any time-sensitive topics.",
-                        "parameters": {
-                            "type": "object",
-                            "properties": {
-                                "query": {
-                                    "type": "string",
-                                    "description": "The search query"
-                                }
-                            },
-                            "required": ["query"]
-                        }
-                    }
-                }]
-                print("Web search function added to tools")
-
-                search_instructions = (
-                    "\n\nYou have web search capabilities. "
-                    "IMPORTANT: You MUST use the web_search function for ANY of these topics:\n"
-                    "- Weather (날씨, 기온, 비, 눈)\n"
-                    "- News (뉴스, 소식)\n"
-                    "- Current events (현재, 최근, 오늘, 지금)\n"
-                    "- Prices (가격, 환율, 주가)\n"
-                    "- Sports scores or results\n"
-                    "- Any question about 2024 or 2025\n"
-                    "- Any time-sensitive information\n\n"
-                    "When in doubt, USE web_search. It's better to search and provide accurate information "
-                    "than to guess or use outdated information."
-                )
-                instructions = base_instructions + search_instructions + translation_instructions
-            else:
-                instructions = base_instructions + translation_instructions
+            instructions = base_instructions + translation_instructions
 
         async with self.client.beta.realtime.connect(
             model="gpt-4o-mini-realtime-preview-2024-12-17"
         ) as conn:
             # Update session with tools
             session_update = {
-                "turn_detection": {
-                    "type": "server_vad",
-                    "threshold": 0.5,
-                    "prefix_padding_ms": 300,
-                    "silence_duration_ms": 100 if self.interpretation_mode else 700  # Even shorter silence
-                },
+                "turn_detection": {"type": "server_vad"},
                 "instructions": instructions,
                 "tools": tools,
-                "tool_choice": "
-                "temperature": 0.0 if self.interpretation_mode else 0.7,
-                "max_response_output_tokens": 50 if self.interpretation_mode else 4096  # Very short responses
+                "tool_choice": "auto" if tools else "none"
             }
 
-            # Add voice setting
-
-
-
-
-
-
-
-
-
-
-
-                # Use very explicit language instructions
-                if self.interpretation_language == "en":
-                    lang_specific = "Respond in English only. 영어로만 답하세요."
-                elif self.interpretation_language == "es":
-                    lang_specific = "Respond in Spanish only. Solo responde en español."
-                elif self.interpretation_language == "fr":
-                    lang_specific = "Respond in French only. Répondez uniquement en français."
-                elif self.interpretation_language == "de":
-                    lang_specific = "Respond in German only. Antworten Sie nur auf Deutsch."
-                elif self.interpretation_language == "ja":
-                    lang_specific = "Respond in Japanese only. 日本語でのみ答えてください。"
-                elif self.interpretation_language == "zh":
-                    lang_specific = "Respond in Chinese only. 只用中文回答。"
-                else:
-                    lang_specific = f"Respond in {target_lang_name} only."
-
-                # Override instructions with ultra-specific directive
-                session_update["instructions"] = (
-                    f"TRANSLATE TO {target_lang_name.upper()}. "
-                    f"{lang_specific} "
-                    f"You are a translation machine. "
-                    f"Input → Translation. Nothing else. "
-                    f"Do not chat. Do not explain. Just translate. "
-                    f"Maximum 20 words per response."
-                )
-
-                # Additional session parameters for interpretation mode
-                session_update["input_audio_transcription"] = {
-                    "model": "whisper-1"
+            # Add voice setting if target language is selected
+            if self.target_language:
+                # Map languages to appropriate voices
+                voice_map = {
+                    "en": "alloy",
+                    "es": "nova",
+                    "fr": "nova",
+                    "de": "nova",
+                    "ja": "nova",
+                    "zh": "nova",
+                    # Default to alloy for other languages
                 }
-
-                print(f"[INTERPRETATION MODE] Target: {self.interpretation_language} ({target_lang_name})")
-                print(f"[INTERPRETATION MODE] Instructions: {session_update['instructions']}")
-                print(f"[INTERPRETATION MODE] Session config: {session_update}")
-            elif voice_language:
-                # Normal translation mode
-                session_update["voice"] = "alloy"
-                print(f"Voice set to: alloy for language: {voice_language}")
+                session_update["voice"] = voice_map.get(self.target_language, "alloy")
 
             await conn.session.update(session=session_update)
             self.connection = conn
-            print(f"Connected with tools: {len(tools)} functions, voice: {session_update.get('voice', 'default')}
-                  f"interpretation_mode: {self.interpretation_mode}, language: {self.interpretation_language if self.interpretation_mode else self.target_language}")
+            print(f"Connected with tools: {len(tools)} functions, voice: {session_update.get('voice', 'default')}")
 
             async for event in self.connection:
-                # Debug logging for interpretation mode
-                if self.interpretation_mode and event.type == "response.audio_transcript.done":
-                    print(f"[INTERPRETATION] Transcript: {event.transcript[:100]}...")
-                    print(f"[INTERPRETATION] Expected language: {self.interpretation_language}")
-
                 # Debug logging for function calls
                 if event.type.startswith("response.function_call"):
                     print(f"Function event: {event.type}")
@@ -1542,11 +1615,7 @@ class OpenAIHandler(AsyncStreamHandler):
                 if event.type == "response.audio_transcript.done":
                     output_data = {
                         "event": event,
-                        "language": SUPPORTED_LANGUAGES.get(
-                            self.interpretation_language if self.interpretation_mode else self.target_language,
-                            ""
-                        ) if (self.interpretation_language or self.target_language) else "",
-                        "mode": "interpretation" if self.interpretation_mode else "normal"
+                        "language": SUPPORTED_LANGUAGES.get(self.target_language, "") if self.target_language else ""
                     }
                     await self.output_queue.put(AdditionalOutputs(output_data))
 
@@ -1607,18 +1676,46 @@ class OpenAIHandler(AsyncStreamHandler):
             self.current_call_id = None
 
     async def receive(self, frame: tuple[int, np.ndarray]) -> None:
-        if
-
-        try:
+        if self.interpretation_mode:
+            # In interpretation mode, buffer audio and process with Whisper
             _, array = frame
             array = array.squeeze()
-
-
-
-
-
+
+            # Simple voice activity detection
+            audio_level = np.abs(array).mean()
+
+            if audio_level > 300:  # Lowered threshold for better detection
+                self.is_recording = True
+                self.silence_frames = 0
+                self.audio_buffer.append(array)
+                if len(self.audio_buffer) % 10 == 0:  # Log every 10 frames
+                    print(f"[INTERPRETATION] Recording... buffer size: {len(self.audio_buffer)}, level: {audio_level:.1f}")
+            elif self.is_recording:
+                self.silence_frames += 1
+                self.audio_buffer.append(array)
+
+                # If we've had enough silence, process the audio
+                if self.silence_frames > self.silence_threshold:
+                    print(f"[INTERPRETATION] Silence detected, processing {len(self.audio_buffer)} frames")
+                    await self.process_interpretation()
+        else:
+            # Normal mode - use Realtime API
+            if not self.connection:
+                return
+            try:
+                _, array = frame
+                array = array.squeeze()
+                audio_message = base64.b64encode(array.tobytes()).decode("utf-8")
+                await self.connection.input_audio_buffer.append(audio=audio_message)
+            except Exception as e:
+                print(f"Error in receive: {e}")
+                # Connection might be closed, ignore the error
 
     async def emit(self) -> tuple[int, np.ndarray] | AdditionalOutputs | None:
+        # In interpretation mode, check if we need to process buffered audio
+        if self.interpretation_mode and self.is_recording and self.silence_frames > self.silence_threshold:
+            await self.process_interpretation()
+
         item = await wait_for_item(self.output_queue)
 
         # Check if it's a dict with text message
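The logic added to `receive()` is a small energy-based voice-activity state machine: frames whose mean amplitude exceeds a threshold (re)start recording, and a run of quieter frames flushes the buffer into `process_interpretation()`. The same idea, restated as a standalone helper for reference; the 300 amplitude level and 30-frame limit simply mirror the values in the diff and are not tuned.

```python
# Standalone restatement of the silence-segmentation logic used in receive();
# thresholds mirror the diff and are not tuned values.
import numpy as np

class SilenceSegmenter:
    def __init__(self, level: float = 300, max_silence: int = 30):
        self.level = level              # mean |amplitude| that counts as speech
        self.max_silence = max_silence  # quiet frames tolerated before flushing
        self.buffer: list[np.ndarray] = []
        self.recording = False
        self.silence = 0

    def push(self, frame: np.ndarray) -> np.ndarray | None:
        """Feed one audio frame; returns the buffered utterance when a pause ends it."""
        if np.abs(frame).mean() > self.level:
            self.recording, self.silence = True, 0
            self.buffer.append(frame)
        elif self.recording:
            self.silence += 1
            self.buffer.append(frame)
            if self.silence > self.max_silence:
                utterance = np.concatenate(self.buffer)
                self.buffer, self.recording, self.silence = [], False, 0
                return utterance
        return None
```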
@@ -1629,9 +1726,16 @@ class OpenAIHandler(AsyncStreamHandler):
             return item
 
     async def shutdown(self) -> None:
-        if self.
-
-        self.
+        if self.interpretation_mode:
+            # Clean up interpretation mode
+            self.audio_buffer = []
+            self.is_recording = False
+            print("[INTERPRETATION MODE] Shutdown complete")
+        else:
+            # Normal mode - close Realtime API connection
+            if self.connection:
+                await self.connection.close()
+                self.connection = None
 
 
 # Create initial handler instance