Mouth of 'MOUSE'

# Voice/text chat server: FastAPI app that bridges a fastrtc audio stream to the
# OpenAI Realtime API, with optional Brave web search, response translation and
# an "interpretation" (translate-only) mode.

import asyncio
import base64
import json
from pathlib import Path
import os

import numpy as np
import openai
from dotenv import load_dotenv
from fastapi import FastAPI, Request
from fastapi.responses import HTMLResponse, StreamingResponse
from fastrtc import (
    AdditionalOutputs,
    AsyncStreamHandler,
    Stream,
    get_twilio_turn_credentials,
    wait_for_item,
)
from gradio.utils import get_space
from openai.types.beta.realtime import ResponseAudioTranscriptDoneEvent
import httpx
from typing import Optional, List, Dict
import gradio as gr

load_dotenv()

SAMPLE_RATE = 24000

# Supported languages for OpenAI Realtime API (language code -> display name)
SUPPORTED_LANGUAGES = {
    "ko": "한국어 (Korean)",
    "en": "English",
    "es": "Español (Spanish)",
    "fr": "Français (French)",
    "de": "Deutsch (German)",
    "it": "Italiano (Italian)",
    "pt": "Português (Portuguese)",
    "ru": "Русский (Russian)",
    "ja": "日本語 (Japanese)",
    "zh": "中文 (Chinese)",
    "ar": "العربية (Arabic)",
    "hi": "हिन्दी (Hindi)",
    "nl": "Nederlands (Dutch)",
    "pl": "Polski (Polish)",
    "tr": "Türkçe (Turkish)",
    "vi": "Tiếng Việt (Vietnamese)",
    "th": "ไทย (Thai)",
    "id": "Bahasa Indonesia",
    "sv": "Svenska (Swedish)",
    "da": "Dansk (Danish)",
    "no": "Norsk (Norwegian)",
    "fi": "Suomi (Finnish)",
    "he": "עברית (Hebrew)",
    "uk": "Українська (Ukrainian)",
    "cs": "Čeština (Czech)",
    "el": "Ελληνικά (Greek)",
    "ro": "Română (Romanian)",
    "hu": "Magyar (Hungarian)",
    "ms": "Bahasa Melayu (Malay)",
}

# Keywords that make the text-chat endpoint run a web search before answering.
_SEARCH_KEYWORDS = (
    "날씨", "기온", "비", "눈", "뉴스", "소식", "현재", "최근", "오늘", "지금",
    "가격", "환율", "주가", "weather", "news", "current", "today", "price",
    "2024", "2025",
)

# HTML content embedded as a string
# NOTE(review): the markup of this page appears to have been stripped from this
# copy of the file (only the visible text remains, and the
# "__RTC_CONFIGURATION__" placeholder that index() substitutes is absent) —
# restore the original HTML before deploying.
HTML_CONTENT = """ Mouth of 'MOUSE'

설정

웹 검색

자동 번역

자동 통역

시스템 프롬프트:

대화

"""


class BraveSearchClient:
    """Brave Search API client"""

    def __init__(self, api_key: str):
        self.api_key = api_key
        self.base_url = "https://api.search.brave.com/res/v1/web/search"

    async def search(self, query: str, count: int = 10) -> List[Dict]:
        """Perform a web search using Brave Search API.

        Args:
            query: Search query string.
            count: Maximum number of results to request and return.

        Returns:
            A list of ``{"title", "url", "description"}`` dicts. The list is
            empty when no API key is configured, when the response contains no
            web results, or when the request fails (the error is printed).
        """
        if not self.api_key:
            return []

        headers = {
            "Accept": "application/json",
            "X-Subscription-Token": self.api_key,
        }
        # NOTE(review): "lang" may not be a parameter Brave recognises (its
        # docs describe "search_lang"/"ui_lang") — confirm before relying on it.
        params = {"q": query, "count": count, "lang": "ko"}

        async with httpx.AsyncClient() as http_client:
            try:
                response = await http_client.get(
                    self.base_url, headers=headers, params=params
                )
                response.raise_for_status()
                data = response.json()
                if "web" not in data or "results" not in data["web"]:
                    return []
                return [
                    {
                        "title": result.get("title", ""),
                        "url": result.get("url", ""),
                        "description": result.get("description", ""),
                    }
                    for result in data["web"]["results"][:count]
                ]
            except Exception as e:
                # Search is best-effort: report and fall back to "no results".
                print(f"Brave Search error: {e}")
                return []


# Initialize search client globally
brave_api_key = os.getenv("BSEARCH_API")
search_client = BraveSearchClient(brave_api_key) if brave_api_key else None
print(f"Search client initialized: {search_client is not None}, API key present: {bool(brave_api_key)}")

# Store connection settings, keyed by webrtc_id (filled by custom_offer).
# NOTE(review): entries are never removed, so this grows with every offer.
connection_settings = {}

# Initialize OpenAI client for text chat
client = openai.AsyncOpenAI()


def _latest_connection_settings():
    """Return ``(webrtc_id, settings)`` for the newest stored offer, or None."""
    if not connection_settings:
        return None
    return max(
        connection_settings.items(),
        key=lambda entry: entry[1].get('timestamp', 0),
    )


def update_chatbot(chatbot: list[dict], response: ResponseAudioTranscriptDoneEvent | dict):
    """Append an assistant transcript to the chat history.

    The stream handler emits dict payloads (``{"event": ..., "language": ...}``
    for transcripts and ``{"type": "search", ...}`` for search notifications),
    so both the wrapped and the bare event are accepted; payloads without a
    transcript leave the history unchanged.

    Returns:
        The same ``chatbot`` list, possibly with one message appended.
    """
    event = response.get("event") if isinstance(response, dict) else response
    transcript = getattr(event, "transcript", None)
    if transcript is not None:
        chatbot.append({"role": "assistant", "content": transcript})
    return chatbot


def get_translation_instructions(target_language: str) -> str:
    """Get instructions for translation based on target language.

    Returns an empty string when no target language is given.
    """
    if not target_language:
        return ""

    language_name = SUPPORTED_LANGUAGES.get(target_language, target_language)
    return (
        f"\n\nIMPORTANT: You must respond in {language_name} ({target_language}). "
        f"Translate all your responses to {language_name}."
    )


async def process_text_chat(message: str, web_search_enabled: bool,
                            target_language: str, system_prompt: str) -> Dict[str, str]:
    """Process text chat using GPT-4o-mini model.

    Args:
        message: The user's message.
        web_search_enabled: Whether a keyword-triggered web search may be run.
        target_language: Language code the reply should be written in ("" = any).
        system_prompt: Custom system prompt; a generic one is used when empty.

    Returns:
        ``{"response": ..., "language": ...}`` on success, or ``{"error": ...}``
        when anything raises.
    """
    try:
        # Prepare system message
        base_instructions = system_prompt or "You are a helpful assistant."
        translation_instructions = get_translation_instructions(target_language)
        messages = [
            {"role": "system", "content": base_instructions + translation_instructions}
        ]

        # Handle web search if enabled and the message looks time-sensitive
        if web_search_enabled and search_client:
            lowered = message.lower()
            if any(keyword in lowered for keyword in _SEARCH_KEYWORDS):
                search_results = await search_client.search(message)
                if search_results:
                    search_context = "웹 검색 결과:\n\n" + "".join(
                        f"{i}. {result['title']}\n{result['description']}\n\n"
                        for i, result in enumerate(search_results[:5], 1)
                    )
                    messages.append({
                        "role": "system",
                        "content": f"다음 웹 검색 결과를 참고하여 답변하세요:\n\n{search_context}"
                    })

        messages.append({"role": "user", "content": message})

        # Call GPT-4o-mini
        response = await client.chat.completions.create(
            model="gpt-4o-mini",
            messages=messages,
            temperature=0.7,
            max_tokens=2000
        )

        return {
            "response": response.choices[0].message.content,
            "language": SUPPORTED_LANGUAGES.get(target_language, "") if target_language else ""
        }
    except Exception as e:
        print(f"Error in text chat: {e}")
        return {"error": str(e)}


class OpenAIHandler(AsyncStreamHandler):
    """fastrtc stream handler that relays audio to/from the OpenAI Realtime API.

    Settings (web search, translation target, system prompt, interpretation
    mode) come from the constructor and are overwritten in ``start_up`` with
    the most recently stored entry of ``connection_settings``.
    """

    def __init__(self, web_search_enabled: bool = False, target_language: str = "",
                 system_prompt: str = "", webrtc_id: Optional[str] = None,
                 interpretation_mode: bool = False, interpretation_language: str = "") -> None:
        super().__init__(
            expected_layout="mono",
            output_sample_rate=SAMPLE_RATE,
            output_frame_size=480,
            input_sample_rate=SAMPLE_RATE,
        )
        self.connection = None
        self.output_queue = asyncio.Queue()
        self.search_client = search_client
        # State of the function call currently being streamed by the model.
        self.function_call_in_progress = False
        self.current_function_args = ""
        self.current_call_id = None
        self.webrtc_id = webrtc_id
        self.web_search_enabled = web_search_enabled
        self.target_language = target_language
        self.system_prompt = system_prompt
        self.interpretation_mode = interpretation_mode
        self.interpretation_language = interpretation_language

        print(f"Handler created with web_search_enabled={web_search_enabled}, "
              f"target_language={target_language}, webrtc_id={webrtc_id}, "
              f"interpretation_mode={interpretation_mode}, interpretation_language={interpretation_language}")

    def copy(self):
        """Create a new handler configured from the most recent stored settings.

        Falls back to a handler with default settings when nothing is stored.
        """
        latest = _latest_connection_settings()
        if latest is not None:
            recent_id, settings = latest
            return OpenAIHandler(
                web_search_enabled=settings.get('web_search_enabled', False),
                target_language=settings.get('target_language', ''),
                system_prompt=settings.get('system_prompt', ''),
                webrtc_id=recent_id,
                interpretation_mode=settings.get('interpretation_mode', False),
                interpretation_language=settings.get('interpretation_language', '')
            )

        print(f"Handler.copy() called - creating new handler with default settings")
        return OpenAIHandler(web_search_enabled=False)

    async def search_web(self, query: str) -> str:
        """Perform web search and return formatted results.

        Returns a (Korean) notice string instead when search is disabled or
        nothing was found.
        """
        if not self.search_client or not self.web_search_enabled:
            return "웹 검색이 비활성화되어 있습니다."

        print(f"Searching web for: {query}")
        results = await self.search_client.search(query)
        if not results:
            return f"'{query}'에 대한 검색 결과를 찾을 수 없습니다."

        # Format search results
        formatted_results = [
            f"{i}. {result['title']}\n"
            f"   URL: {result['url']}\n"
            f"   {result['description']}\n"
            for i, result in enumerate(results, 1)
        ]
        return f"웹 검색 결과 '{query}':\n\n" + "\n".join(formatted_results)

    async def process_text_message(self, message: str):
        """Send a user text message to the realtime session and request a reply.

        Does nothing when no realtime connection is open.
        """
        if self.connection:
            await self.connection.conversation.item.create(
                item={
                    "type": "message",
                    "role": "user",
                    "content": [{"type": "input_text", "text": message}]
                }
            )
            await self.connection.response.create()

    def get_interpretation_instructions(self):
        """Get instructions for interpretation mode.

        Returns an empty string unless interpretation mode is on and an
        interpretation language is set.
        """
        if not self.interpretation_mode or not self.interpretation_language:
            return ""

        target_language_name = SUPPORTED_LANGUAGES.get(self.interpretation_language, self.interpretation_language)
        target_code = self.interpretation_language

        # Language-specific examples (English is used for any other language)
        examples = {
            "en": "Hello, the weather is nice today",
            "es": "Hola, el clima está agradable hoy",
            "fr": "Bonjour, il fait beau aujourd'hui",
            "de": "Hallo, das Wetter ist heute schön",
            "ja": "こんにちは、今日はいい天気ですね",
            "zh": "你好，今天天气很好"
        }
        example_translation = examples.get(target_code, "Hello, the weather is nice today")

        return (
            f"INTERPRETATION MODE - CRITICAL RULES:\n\n"
            f"1. You are ONLY a translator to {target_language_name} (language code: {target_code}).\n"
            f"2. NEVER respond in any other language.\n"
            f"3. NEVER generate conversation or additional content.\n"
            f"4. ONLY translate what the user says.\n"
            f"5. STOP immediately after translating.\n\n"
            f"Example:\n"
            f"If user says: '안녕하세요, 오늘 날씨가 좋네요'\n"
            f"You MUST respond ONLY: '{example_translation}'\n\n"
            f"DO NOT say anything else. DO NOT continue talking.\n"
            f"Your output language MUST be {target_language_name} ONLY."
        )

    def get_translation_instructions(self):
        """Get instructions for translation based on target language.

        Returns an empty string when no target language is set or when
        interpretation mode is active (which supplies its own instructions).
        """
        if not self.target_language or self.interpretation_mode:
            return ""

        language_name = SUPPORTED_LANGUAGES.get(self.target_language, self.target_language)
        return (
            f"\n\nIMPORTANT: You must respond in {language_name} ({self.target_language}). "
            f"Translate all your responses to {language_name}. "
            f"This includes both spoken and written responses."
        )

    def _apply_latest_settings(self) -> None:
        """Overwrite this handler's settings with the most recent stored offer."""
        latest = _latest_connection_settings()
        if latest is None:
            return
        recent_id, settings = latest
        self.web_search_enabled = settings.get('web_search_enabled', False)
        self.target_language = settings.get('target_language', '')
        self.system_prompt = settings.get('system_prompt', '')
        self.interpretation_mode = settings.get('interpretation_mode', False)
        self.interpretation_language = settings.get('interpretation_language', '')
        self.webrtc_id = recent_id

        print(f"start_up: Updated settings from storage - webrtc_id={self.webrtc_id}, "
              f"web_search_enabled={self.web_search_enabled}, target_language={self.target_language}, "
              f"interpretation_mode={self.interpretation_mode}")
        print(f"Handler interpretation settings: mode={self.interpretation_mode}, language={self.interpretation_language}")

    def _build_instructions_and_tools(self):
        """Return ``(instructions, tools)`` for the realtime session."""
        if self.interpretation_mode:
            # In interpretation mode, override all instructions
            base_instructions = (
                f"You are a professional interpreter. Your ONLY task is to translate what the user says "
                f"into {SUPPORTED_LANGUAGES.get(self.interpretation_language, self.interpretation_language)}. "
                f"Do not add any commentary, do not continue the conversation, do not generate new content. "
                f"Simply translate what was said and stop."
            )
            instructions = base_instructions + "\n\n" + self.get_interpretation_instructions()
            print(f"Interpretation mode active - target language: {self.interpretation_language}")
            # No tools in interpretation mode
            return instructions, []

        # Normal mode - add translation instructions if language is selected
        base_instructions = self.system_prompt or "You are a helpful assistant."
        translation_instructions = self.get_translation_instructions()

        if not (self.web_search_enabled and self.search_client):
            return base_instructions + translation_instructions, []

        # The Realtime API takes function tools in flat form (name/description/
        # parameters at the top level), unlike Chat Completions, which nests
        # them under a "function" key.
        tools = [{
            "type": "function",
            "name": "web_search",
            "description": "Search the web for current information. Use this for weather, news, prices, current events, or any time-sensitive topics.",
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": "The search query"
                    }
                },
                "required": ["query"]
            }
        }]
        print("Web search function added to tools")

        search_instructions = (
            "\n\nYou have web search capabilities. "
            "IMPORTANT: You MUST use the web_search function for ANY of these topics:\n"
            "- Weather (날씨, 기온, 비, 눈)\n"
            "- News (뉴스, 소식)\n"
            "- Current events (현재, 최근, 오늘, 지금)\n"
            "- Prices (가격, 환율, 주가)\n"
            "- Sports scores or results\n"
            "- Any question about 2024 or 2025\n"
            "- Any time-sensitive information\n\n"
            "When in doubt, USE web_search. It's better to search and provide accurate information "
            "than to guess or use outdated information."
        )
        return base_instructions + search_instructions + translation_instructions, tools

    def _build_session_update(self, instructions: str, tools: list) -> dict:
        """Return the ``session.update`` payload for the current settings."""
        session_update = {
            "turn_detection": {
                "type": "server_vad",
                "threshold": 0.5,
                "prefix_padding_ms": 300,
                # Shorter end-of-speech silence so interpretation starts sooner.
                "silence_duration_ms": 500 if self.interpretation_mode else 700
            },
            "instructions": instructions,
            "tools": tools,
            "tool_choice": "auto" if tools else "none"
        }

        # Add voice setting based on interpretation or translation language
        voice_language = self.interpretation_language if self.interpretation_mode else self.target_language
        if voice_language:
            # Use only alloy voice to avoid language confusion
            # The model will handle the language based on instructions
            session_update["voice"] = "alloy"
            if self.interpretation_mode:
                session_update["output_audio_format"] = "pcm16"
            print(f"Voice set to: alloy for language: {voice_language}")

        # For interpretation mode, ensure proper language settings
        if self.interpretation_mode and self.interpretation_language:
            session_update["modalities"] = ["text", "audio"]
            # The Realtime API limits temperature to [0.6, 1.2]; use the
            # minimum for the most literal translation.
            session_update["temperature"] = 0.6
            session_update["max_response_output_tokens"] = 500  # Limit output to prevent long generations
            print(f"Interpretation session config: voice={session_update.get('voice')}, lang={self.interpretation_language}")

        return session_update

    async def _handle_function_call(self, arguments: str, call_id) -> None:
        """Run the web search the model requested and send the result back.

        Errors are printed rather than raised so a failed search does not end
        the realtime event loop; the streamed-call state is always reset.
        """
        print(f"Function call done, args: {arguments}")
        try:
            args = json.loads(arguments)
            query = args.get("query", "")

            # Emit search event to client
            await self.output_queue.put(AdditionalOutputs({
                "type": "search",
                "query": query
            }))

            # Perform the search
            search_results = await self.search_web(query)
            print(f"Search results length: {len(search_results)}")

            # Send function result back to the model
            if self.connection and call_id:
                await self.connection.conversation.item.create(
                    item={
                        "type": "function_call_output",
                        "call_id": call_id,
                        "output": search_results
                    }
                )
                await self.connection.response.create()
        except Exception as e:
            print(f"Function call error: {e}")
        finally:
            self.function_call_in_progress = False
            self.current_function_args = ""
            self.current_call_id = None

    async def start_up(self):
        """Connect to realtime API with function calling enabled.

        Refreshes settings from ``connection_settings``, configures the
        session, then consumes server events until the connection ends:
        transcripts and audio are queued for ``emit`` and ``web_search``
        function calls are executed.
        """
        # First check if we have the most recent settings
        self._apply_latest_settings()

        print(f"Starting up handler with web_search_enabled={self.web_search_enabled}, "
              f"target_language={self.target_language}, interpretation_mode={self.interpretation_mode}, "
              f"interpretation_language={self.interpretation_language}")

        self.client = openai.AsyncOpenAI()
        instructions, tools = self._build_instructions_and_tools()

        async with self.client.beta.realtime.connect(
            model="gpt-4o-mini-realtime-preview-2024-12-17"
        ) as conn:
            # Update session with tools
            session_update = self._build_session_update(instructions, tools)
            await conn.session.update(session=session_update)
            self.connection = conn
            print(f"Connected with tools: {len(tools)} functions, voice: {session_update.get('voice', 'default')}, "
                  f"interpretation_mode: {self.interpretation_mode}, language: {self.interpretation_language if self.interpretation_mode else self.target_language}")

            async for event in self.connection:
                # Debug logging for function calls
                if event.type.startswith("response.function_call"):
                    print(f"Function event: {event.type}")

                if event.type == "response.audio_transcript.done":
                    output_data = {
                        "event": event,
                        "language": SUPPORTED_LANGUAGES.get(
                            self.interpretation_language if self.interpretation_mode else self.target_language,
                            ""
                        ) if (self.interpretation_language or self.target_language) else ""
                    }
                    await self.output_queue.put(AdditionalOutputs(output_data))

                elif event.type == "response.audio.delta":
                    await self.output_queue.put(
                        (
                            self.output_sample_rate,
                            np.frombuffer(
                                base64.b64decode(event.delta), dtype=np.int16
                            ).reshape(1, -1),
                        ),
                    )

                # Handle function calls (only in non-interpretation mode).
                # The API emits only ".delta" and ".done" for function-call
                # arguments, so the first delta marks the start of a call.
                elif event.type == "response.function_call_arguments.delta" and not self.interpretation_mode:
                    if not self.function_call_in_progress:
                        print(f"Function call started")
                        self.function_call_in_progress = True
                        self.current_function_args = ""
                    self.current_call_id = getattr(event, 'call_id', None) or self.current_call_id
                    self.current_function_args += event.delta

                elif event.type == "response.function_call_arguments.done" and not self.interpretation_mode:
                    # Prefer the complete payload carried by the done event and
                    # fall back to what was accumulated from the deltas.
                    arguments = getattr(event, 'arguments', None) or self.current_function_args
                    call_id = getattr(event, 'call_id', None) or self.current_call_id
                    await self._handle_function_call(arguments, call_id)

    async def receive(self, frame: tuple[int, np.ndarray]) -> None:
        """Forward one incoming audio frame to the realtime input buffer.

        Frames are dropped while no connection is open; send errors are
        printed and otherwise ignored.
        """
        if not self.connection:
            return
        try:
            _, array = frame
            array = array.squeeze()
            audio_message = base64.b64encode(array.tobytes()).decode("utf-8")
            await self.connection.input_audio_buffer.append(audio=audio_message)
        except Exception as e:
            print(f"Error in receive: {e}")
            # Connection might be closed, ignore the error

    async def emit(self) -> tuple[int, np.ndarray] | AdditionalOutputs | None:
        """Return the next queued output for the stream.

        Text messages queued by ``/text_message/{webrtc_id}`` are forwarded to
        the realtime session here and yield ``None`` to the caller.
        """
        item = await wait_for_item(self.output_queue)

        # Check if it's a dict with text message
        if isinstance(item, dict) and item.get('type') == 'text_message':
            await self.process_text_message(item['content'])
            return None

        return item

    async def shutdown(self) -> None:
        """Close the realtime connection, if one is open."""
        if self.connection:
            await self.connection.close()
            self.connection = None


# Create initial handler instance
handler = OpenAIHandler(web_search_enabled=False, interpretation_mode=False)

# Create components
chatbot = gr.Chatbot(type="messages")

# Create stream with handler instance
stream = Stream(
    handler,  # Pass instance, not factory
    mode="send-receive",
    modality="audio",
    additional_inputs=[chatbot],
    additional_outputs=[chatbot],
    additional_outputs_handler=update_chatbot,
    rtc_configuration=get_twilio_turn_credentials() if get_space() else None,
    concurrency_limit=5 if get_space() else None,
    time_limit=300 if get_space() else None,
)

app = FastAPI()

# Mount stream
stream.mount(app)


# Intercept offer to capture settings
# NOTE(review): this route is registered after stream.mount(app); if the stream
# registers its own "/webrtc/offer" route, that one may be matched first, and
# stream.offer may expect a request model rather than a dict — verify.
@app.post("/webrtc/offer", include_in_schema=False)
async def custom_offer(request: Request):
    """Intercept offer to capture settings.

    Stores the per-connection settings sent with the offer under its
    ``webrtc_id`` and forwards the request body to the stream's offer handler.
    """
    body = await request.json()

    webrtc_id = body.get("webrtc_id")
    web_search_enabled = body.get("web_search_enabled", False)
    target_language = body.get("target_language", "")
    system_prompt = body.get("system_prompt", "")
    interpretation_mode = body.get("interpretation_mode", False)
    interpretation_language = body.get("interpretation_language", "")

    print(f"Custom offer - webrtc_id: {webrtc_id}, web_search_enabled: {web_search_enabled}, "
          f"target_language: {target_language}, interpretation_mode: {interpretation_mode}, "
          f"interpretation_language: {interpretation_language}")

    # Store settings with timestamp
    if webrtc_id:
        connection_settings[webrtc_id] = {
            'web_search_enabled': web_search_enabled,
            'target_language': target_language,
            'system_prompt': system_prompt,
            'interpretation_mode': interpretation_mode,
            'interpretation_language': interpretation_language,
            'timestamp': asyncio.get_running_loop().time()
        }

    # Remove our custom route temporarily
    custom_route = None
    for i, route in enumerate(app.routes):
        if hasattr(route, 'path') and route.path == "/webrtc/offer" and route.endpoint == custom_offer:
            custom_route = app.routes.pop(i)
            break

    try:
        # Forward to stream's offer handler
        response = await stream.offer(body)
    finally:
        # Re-add our custom route even when the forwarded offer fails, so a
        # single error does not permanently disable settings capture.
        if custom_route:
            app.routes.insert(0, custom_route)

    return response


@app.post("/chat/text")
async def chat_text(request: Request):
    """Handle text chat messages using GPT-4o-mini.

    Returns the dict produced by ``process_text_chat``, or an ``{"error": ...}``
    dict for an empty message or an unexpected failure.
    """
    try:
        body = await request.json()
        message = body.get("message", "")
        web_search_enabled = body.get("web_search_enabled", False)
        target_language = body.get("target_language", "")
        system_prompt = body.get("system_prompt", "")

        if not message:
            return {"error": "메시지가 비어있습니다."}

        # Process text chat
        return await process_text_chat(message, web_search_enabled, target_language, system_prompt)
    except Exception as e:
        print(f"Error in chat_text endpoint: {e}")
        return {"error": "채팅 처리 중 오류가 발생했습니다."}


@app.post("/text_message/{webrtc_id}")
async def receive_text_message(webrtc_id: str, request: Request):
    """Receive text message from client.

    The message is queued on the matching connection's handler; the response
    is ``{"status": "ok"}`` whether or not a handler was found.
    """
    body = await request.json()
    message = body.get("content", "")

    # Find the handler for this connection
    if webrtc_id in stream.handlers:
        target_handler = stream.handlers[webrtc_id]
        # Queue the text message for processing (picked up by emit())
        await target_handler.output_queue.put({
            'type': 'text_message',
            'content': message
        })

    return {"status": "ok"}


@app.get("/outputs")
async def outputs(webrtc_id: str):
    """Stream outputs including search events (server-sent events)."""
    async def output_stream():
        async for output in stream.output_stream(webrtc_id):
            if not (hasattr(output, 'args') and output.args):
                continue
            payload = output.args[0]
            if not isinstance(payload, dict):
                continue

            # Check if it's a search event
            if payload.get('type') == 'search':
                yield f"event: search\ndata: {json.dumps(payload)}\n\n"
            # Regular transcript event with language info
            elif 'event' in payload:
                event = payload['event']
                if hasattr(event, 'transcript'):
                    data = {
                        "role": "assistant",
                        "content": event.transcript,
                        "language": payload.get('language', '')
                    }
                    yield f"event: output\ndata: {json.dumps(data)}\n\n"

    return StreamingResponse(output_stream(), media_type="text/event-stream")


@app.get("/")
async def index():
    """Serve the HTML page"""
    rtc_config = get_twilio_turn_credentials() if get_space() else None
    html_content = HTML_CONTENT.replace("__RTC_CONFIGURATION__", json.dumps(rtc_config))
    return HTMLResponse(content=html_content)


if __name__ == "__main__":
    import uvicorn

    mode = os.getenv("MODE")
    if mode == "UI":
        stream.ui.launch(server_port=7860)
    elif mode == "PHONE":
        stream.fastphone(host="0.0.0.0", port=7860)
    else:
        uvicorn.run(app, host="0.0.0.0", port=7860)

MOUSE 음성 챗

설정

대화