Update app.py

app.py CHANGED
@@ -1211,6 +1211,17 @@ connection_settings = {}
 # Initialize OpenAI client for text chat
 client = openai.AsyncOpenAI()
 
+def get_translation_instructions(target_language: str) -> str:
+    """Get instructions for translation based on target language"""
+    if not target_language:
+        return ""
+
+    language_name = SUPPORTED_LANGUAGES.get(target_language, target_language)
+    return (
+        f"\n\nIMPORTANT: You must respond in {language_name} ({target_language}). "
+        f"Translate all your responses to {language_name}."
+    )
+
 def update_chatbot(chatbot: list[dict], response: ResponseAudioTranscriptDoneEvent):
     chatbot.append({"role": "assistant", "content": response.transcript})
     return chatbot
@@ -1234,10 +1245,38 @@ async def process_text_chat(message: str, web_search_enabled: bool, target_langu
     try:
         # Prepare system message
         base_instructions = system_prompt or "You are a helpful assistant."
+
+        # Add strong language instructions if language is selected
+        if target_language:
+            language_name = SUPPORTED_LANGUAGES.get(target_language, target_language)
+
+            # Language-specific instructions
+            if target_language == "en":
+                lang_specific = "\nYou MUST respond in English ONLY. Never use Korean or any other language."
+            elif target_language == "ja":
+                lang_specific = "\n日本語でのみ応答してください。韓国語や他の言語は使用しないでください。"
+            elif target_language == "zh":
+                lang_specific = "\n只能用中文回答。不要使用韩语或其他任何语言。"
+            elif target_language == "es":
+                lang_specific = "\nDebe responder SOLO en español. Nunca use coreano u otros idiomas."
+            elif target_language == "fr":
+                lang_specific = "\nVous devez répondre UNIQUEMENT en français. N'utilisez jamais le coréen ou d'autres langues."
+            elif target_language == "de":
+                lang_specific = "\nSie müssen NUR auf Deutsch antworten. Verwenden Sie niemals Koreanisch oder andere Sprachen."
+            else:
+                lang_specific = f"\nYou MUST respond ONLY in {language_name}. Never use any other language."
+
+            translation_instructions = (
+                f"\n\nIMPORTANT: Your response language is set to {language_name} ({target_language})."
+                f"{lang_specific}"
+                f"\nEven if the user writes in Korean or another language, you must ALWAYS respond in {language_name}."
+                f"\nThis is a strict requirement. Output language: {language_name} ONLY."
+            )
+
+            base_instructions = base_instructions + translation_instructions
 
         messages = [
+            {"role": "system", "content": base_instructions}
         ]
 
         # Handle web search if enabled
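
The per-language if/elif chain above could equally be a dict lookup; a sketch only, with an illustrative LANG_SPECIFIC table that is not part of the commit:

    LANG_SPECIFIC = {  # hypothetical module-level table
        "en": "\nYou MUST respond in English ONLY. Never use Korean or any other language.",
        "ja": "\n日本語でのみ応答してください。韓国語や他の言語は使用しないでください。",
        # ...remaining languages as in the diff
    }
    lang_specific = LANG_SPECIFIC.get(
        target_language,
        f"\nYou MUST respond ONLY in {language_name}. Never use any other language.",
    )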
@@ -1264,7 +1303,7 @@ async def process_text_chat(message: str, web_search_enabled: bool, target_langu
 
         messages.append({"role": "user", "content": message})
 
-        # Call GPT-4o-mini
+        # Call GPT-4o-mini with strong language enforcement
         response = await client.chat.completions.create(
             model="gpt-4o-mini",
             messages=messages,
@@ -1272,8 +1311,14 @@ async def process_text_chat(message: str, web_search_enabled: bool, target_langu
             max_tokens=2000
         )
 
+        response_text = response.choices[0].message.content
+
+        # Debug logging
+        print(f"[TEXT CHAT] Target language: {target_language}")
+        print(f"[TEXT CHAT] Response preview: {response_text[:100]}...")
+
         return {
+            "response": response_text,
             "language": SUPPORTED_LANGUAGES.get(target_language, "") if target_language else ""
         }
 
@@ -1309,7 +1354,8 @@ class OpenAIHandler(AsyncStreamHandler):
         self.audio_buffer = []
         self.is_recording = False
         self.silence_frames = 0
+        self.silence_threshold = 20  # Reduced for faster response (20 frames = ~0.4 seconds)
+        self.min_audio_length = 10  # Minimum frames to consider as speech
 
         print(f"Handler created with web_search_enabled={web_search_enabled}, "
               f"target_language={target_language}, webrtc_id={webrtc_id}, "
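
The "~0.4 seconds" in the comment follows from the frame length; a quick check, assuming ~20 ms frames (480 samples at 24 kHz, the chunk size used later in this file):

    frame_seconds = 480 / 24_000              # 0.02 s per frame (assumed frame size)
    silence_threshold = 20
    print(silence_threshold * frame_seconds)  # 0.4 s of trailing silence ends an utterance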
@@ -1409,21 +1455,32 @@ class OpenAIHandler(AsyncStreamHandler):
             # 2. Translate with GPT-4o-mini
             target_lang_name = SUPPORTED_LANGUAGES.get(self.interpretation_language, self.interpretation_language)
 
+            # Create very specific translation prompt
+            if self.interpretation_language == "en":
+                system_prompt = "You are a translator. Translate Korean to English. Output ONLY the English translation, nothing else."
+            elif self.interpretation_language == "ja":
+                system_prompt = "You are a translator. Translate Korean to Japanese. Output ONLY the Japanese translation (日本語のみ), nothing else."
+            elif self.interpretation_language == "zh":
+                system_prompt = "You are a translator. Translate Korean to Chinese. Output ONLY the Chinese translation (只输出中文), nothing else."
+            elif self.interpretation_language == "es":
+                system_prompt = "You are a translator. Translate Korean to Spanish. Output ONLY the Spanish translation (solo español), nothing else."
+            else:
+                system_prompt = f"You are a translator. Translate Korean to {target_lang_name}. Output ONLY the {target_lang_name} translation, nothing else."
+
             print(f"[INTERPRETATION] Translating to {target_lang_name}...")
             translation_response = await self.client.chat.completions.create(
                 model="gpt-4o-mini",
                 messages=[
                     {
                         "role": "system",
+                        "content": system_prompt
                     },
                     {
                         "role": "user",
                         "content": user_text
                     }
                 ],
+                temperature=0.1, # Lower temperature for more literal translation
                 max_tokens=200
             )
 
@@ -1432,9 +1489,25 @@ class OpenAIHandler(AsyncStreamHandler):
 
             # 3. Generate speech with TTS
             print("[INTERPRETATION] Generating speech...")
+
+            # Select appropriate voice for the language
+            # Using voices that work better for each language
+            voice_map = {
+                "en": "nova", # Nova has clear English pronunciation
+                "es": "nova", # Nova works well for Spanish
+                "fr": "shimmer", # Shimmer works well for French
+                "de": "onyx", # Onyx works well for German
+                "ja": "nova", # Nova can handle Japanese
+                "zh": "nova", # Nova can handle Chinese
+                "ko": "nova", # Nova can handle Korean
+            }
+            selected_voice = voice_map.get(self.interpretation_language, "nova")
+
+            print(f"[INTERPRETATION] Using voice: {selected_voice} for language: {self.interpretation_language}")
+
             tts_response = await self.client.audio.speech.create(
                 model="tts-1",
+                voice=selected_voice,
                 input=translated_text,
                 response_format="pcm", # PCM format for direct playback
                 speed=1.0
@@ -1445,10 +1518,10 @@ class OpenAIHandler(AsyncStreamHandler):
             async for chunk in tts_response.iter_bytes(1024):
                 audio_bytes += chunk
 
-            # Convert PCM to numpy array
+            # Convert PCM to numpy array (TTS outputs at 24kHz)
             audio_array = np.frombuffer(audio_bytes, dtype=np.int16)
 
+            # Send audio in chunks
            if len(audio_array) > 0:
                 # Split audio into chunks and send
                 chunk_size = 480 # Match our frame size
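
The loop that actually splits and sends these frames falls outside this hunk; a sketch of one plausible shape for it (the padding behaviour and the helper name are assumptions, not shown in the diff):

    import numpy as np

    def split_pcm_into_frames(audio_array: np.ndarray, chunk_size: int = 480) -> list[np.ndarray]:
        # Pad the tail so every frame carries exactly chunk_size samples.
        frames = []
        for start in range(0, len(audio_array), chunk_size):
            chunk = audio_array[start:start + chunk_size]
            if len(chunk) < chunk_size:
                chunk = np.pad(chunk, (0, chunk_size - len(chunk)))
            frames.append(chunk)
        return frames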
@@ -1472,6 +1545,9 @@ class OpenAIHandler(AsyncStreamHandler):
 
         except Exception as e:
             print(f"[INTERPRETATION] Error: {e}")
+            import traceback
+            traceback.print_exc()
+
             # Send error message to client
             error_data = {
                 "event": type('Event', (), {
@@ -1531,6 +1607,15 @@ class OpenAIHandler(AsyncStreamHandler):
             print(f"[INTERPRETATION MODE] Active - using Whisper + GPT-4o-mini + TTS")
             print(f"[INTERPRETATION MODE] Target language: {self.interpretation_language}")
             # Just keep the handler ready to process audio
+            # Create an infinite loop to keep the handler alive
+            try:
+                while True:
+                    await asyncio.sleep(0.1)
+                    # Check if we need to process any audio
+                    if self.is_recording and self.silence_frames > self.silence_threshold:
+                        await self.process_interpretation()
+            except asyncio.CancelledError:
+                print("[INTERPRETATION MODE] Handler cancelled")
             return
 
         # Normal mode - connect to Realtime API
@@ -1539,7 +1624,68 @@ class OpenAIHandler(AsyncStreamHandler):
         base_instructions = self.system_prompt or "You are a helpful assistant."
 
         # Add translation instructions if language is selected
+        if self.target_language:
+            language_name = SUPPORTED_LANGUAGES.get(self.target_language, self.target_language)
+
+            # Create base translation instruction
+            base_translation = f"You MUST respond ONLY in {language_name}."
+
+            # Add language-specific instructions with examples
+            if self.target_language == "en":
+                translation_instructions = f"""
+{base_translation}
+
+CRITICAL RULES:
+1. EVERY word you say must be in English.
+2. Do NOT use Korean (한국어) at all.
+3. If user speaks Korean, understand it but ALWAYS reply in English.
+4. Example: User says "안녕하세요" → You say "Hello! How can I help you today?"
+5. Your language mode is: ENGLISH ONLY.
+"""
+            elif self.target_language == "ja":
+                translation_instructions = f"""
+{base_translation}
+
+重要なルール:
+1. 必ず日本語のみで応答してください。
+2. 韓国語は一切使用しないでください。
+3. ユーザーが韓国語で話しても、必ず日本語で返答してください。
+4. 例:ユーザーが「안녕하세요」と言ったら → あなたは「こんにちは!今日はどのようにお手伝いできますか?」と言う
+5. 言語モード:日本語のみ
+"""
+            elif self.target_language == "zh":
+                translation_instructions = f"""
+{base_translation}
+
+重要规则:
+1. 必须只用中文回答。
+2. 绝对不要使用韩语。
+3. 即使用户说韩语,也必须用中文回复。
+4. 例如:用户说"안녕하세요" → 你说"你好!我能为您做什么?"
+5. 语言模式:仅中文
+"""
+            elif self.target_language == "es":
+                translation_instructions = f"""
+{base_translation}
+
+REGLAS CRÍTICAS:
+1. TODAS tus palabras deben estar en español.
+2. NO uses coreano en absoluto.
+3. Si el usuario habla coreano, entiéndelo pero SIEMPRE responde en español.
+4. Ejemplo: Usuario dice "안녕하세요" → Tú dices "¡Hola! ¿Cómo puedo ayudarte hoy?"
+5. Modo de idioma: SOLO ESPAÑOL
+"""
+            else:
+                translation_instructions = f"""
+{base_translation}
+
+RULES:
+1. You must ONLY speak in {language_name}.
+2. NEVER use Korean or any other language.
+3. Always respond in {language_name} regardless of what language the user speaks.
+"""
+        else:
+            translation_instructions = ""
 
         if self.web_search_enabled and self.search_client:
             tools = [{
@@ -1574,10 +1720,12 @@ class OpenAIHandler(AsyncStreamHandler):
                 "When in doubt, USE web_search. It's better to search and provide accurate information "
                 "than to guess or use outdated information."
             )
+            instructions = base_instructions + translation_instructions + search_instructions
         else:
            instructions = base_instructions + translation_instructions
 
+        print(f"[NORMAL MODE] Instructions: {instructions[:200]}...")
+
         async with self.client.beta.realtime.connect(
             model="gpt-4o-mini-realtime-preview-2024-12-17"
         ) as conn:
@@ -1589,19 +1737,39 @@ class OpenAIHandler(AsyncStreamHandler):
                 "tool_choice": "auto" if tools else "none"
             }
 
+            # Use appropriate voice for the language
             if self.target_language:
+                # Use voice that works better for each language
                 voice_map = {
+                    "en": "nova", # Nova has clear pronunciation
+                    "es": "nova", # Nova works well for Spanish
+                    "fr": "shimmer", # Shimmer for French
+                    "de": "onyx", # Onyx for German
+                    "ja": "nova", # Nova can handle Japanese
+                    "zh": "nova", # Nova can handle Chinese
+                    "ko": "nova", # Nova can handle Korean
                 }
+                session_update["voice"] = voice_map.get(self.target_language, "nova")
+
+                # Force output language settings
+                session_update["modalities"] = ["text", "audio"]
+                session_update["output_audio_format"] = "pcm16"
+
+                # Add extra language enforcement in system message
+                if self.target_language == "en":
+                    extra_instruction = "\n\nREMINDER: Speak in English only. 英語のみで話してください。"
+                elif self.target_language == "ja":
+                    extra_instruction = "\n\nREMINDER: 日本語のみで話してください。Speak in Japanese only."
+                elif self.target_language == "zh":
+                    extra_instruction = "\n\nREMINDER: 只说中文。Speak in Chinese only."
+                else:
+                    extra_instruction = ""
+
+                session_update["instructions"] = instructions + extra_instruction
+
+                print(f"[TRANSLATION MODE] Target language: {self.target_language}")
+                print(f"[TRANSLATION MODE] Voice: {session_update['voice']}")
+                print(f"[TRANSLATION MODE] Instructions preview: {session_update['instructions'][:200]}...")
 
             await conn.session.update(session=session_update)
             self.connection = conn
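
The same voice_map literal now appears both in the interpretation TTS path and in this Realtime session setup; a small helper (sketch only, the name is hypothetical) would keep the two copies in sync:

    def pick_voice(language_code: str) -> str:
        # One shared table instead of two literals; values copied from the diff.
        voice_map = {
            "en": "nova", "es": "nova", "ja": "nova", "zh": "nova", "ko": "nova",
            "fr": "shimmer",
            "de": "onyx",
        }
        return voice_map.get(language_code, "nova")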
@@ -1613,6 +1781,9 @@ class OpenAIHandler(AsyncStreamHandler):
                 print(f"Function event: {event.type}")
 
                 if event.type == "response.audio_transcript.done":
+                    print(f"[RESPONSE] Transcript: {event.transcript[:100]}...")
+                    print(f"[RESPONSE] Expected language: {self.target_language}")
+
                     output_data = {
                         "event": event,
                         "language": SUPPORTED_LANGUAGES.get(self.target_language, "") if self.target_language else ""
@@ -1684,20 +1855,21 @@ class OpenAIHandler(AsyncStreamHandler):
             # Simple voice activity detection
             audio_level = np.abs(array).mean()
 
+            if audio_level > 200:  # Lower threshold for better detection
+                if not self.is_recording:
+                    print(f"[INTERPRETATION] Started recording, level: {audio_level:.1f}")
                 self.is_recording = True
                 self.silence_frames = 0
                 self.audio_buffer.append(array)
-                if len(self.audio_buffer) % 10 == 0: # Log every 10 frames
-                    print(f"[INTERPRETATION] Recording... buffer size: {len(self.audio_buffer)}, level: {audio_level:.1f}")
             elif self.is_recording:
                 self.silence_frames += 1
                 self.audio_buffer.append(array)
 
                 # If we've had enough silence, process the audio
-                if self.silence_frames > self.silence_threshold:
+                if self.silence_frames > self.silence_threshold and len(self.audio_buffer) > self.min_audio_length:
+                    print(f"[INTERPRETATION] Silence detected after {len(self.audio_buffer)} frames")
+                    # Process in the background to avoid blocking
+                    asyncio.create_task(self.process_interpretation())
         else:
             # Normal mode - use Realtime API
             if not self.connection:
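
The detection added here is a plain energy threshold with a silence countdown; a condensed, framework-free restatement (sketch only, thresholds copied from the diff):

    import numpy as np

    AUDIO_LEVEL_THRESHOLD = 200   # mean |amplitude| above this counts as speech
    SILENCE_THRESHOLD = 20        # ~0.4 s of trailing silence ends an utterance
    MIN_AUDIO_LENGTH = 10         # ignore bursts shorter than 10 frames

    def feed_frame(state: dict, frame: np.ndarray) -> bool:
        """Feed one frame; return True once a complete utterance is buffered."""
        level = np.abs(frame).mean()
        if level > AUDIO_LEVEL_THRESHOLD:
            state["recording"] = True
            state["silence"] = 0
            state["buffer"].append(frame)
        elif state.get("recording"):
            state["silence"] += 1
            state["buffer"].append(frame)
            return state["silence"] > SILENCE_THRESHOLD and len(state["buffer"]) > MIN_AUDIO_LENGTH
        return False

    # Usage: state = {"buffer": [], "silence": 0, "recording": False}; call feed_frame(state, frame) per frame.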
@@ -1712,18 +1884,24 @@ class OpenAIHandler(AsyncStreamHandler):
             # Connection might be closed, ignore the error
 
     async def emit(self) -> tuple[int, np.ndarray] | AdditionalOutputs | None:
+        # In interpretation mode, we need to keep checking for audio
+        if self.interpretation_mode:
+            # Use a timeout to prevent blocking forever
+            try:
+                item = await asyncio.wait_for(wait_for_item(self.output_queue), timeout=0.1)
+                return item
+            except asyncio.TimeoutError:
+                return None
+        else:
+            # Normal mode
+            item = await wait_for_item(self.output_queue)
+
+            # Check if it's a dict with text message
+            if isinstance(item, dict) and item.get('type') == 'text_message':
+                await self.process_text_message(item['content'])
+                return None
+
+            return item
 
     async def shutdown(self) -> None:
         if self.interpretation_mode: