import asyncio
import base64
import json
from pathlib import Path
import os
import numpy as np
import openai
from dotenv import load_dotenv
from fastapi import FastAPI, Request
from fastapi.responses import HTMLResponse, StreamingResponse
from fastrtc import (
AdditionalOutputs,
AsyncStreamHandler,
Stream,
get_twilio_turn_credentials,
wait_for_item,
)
from gradio.utils import get_space
from openai.types.beta.realtime import ResponseAudioTranscriptDoneEvent
import httpx
from typing import Optional, List, Dict
import gradio as gr
# Load variables from a local .env file into the process environment before
# anything below reads them via os.getenv.
load_dotenv()
# Sample rate (Hz) used for both the input and the output audio of the handler.
SAMPLE_RATE = 24000
# Languages offered for translation/interpretation.
# Maps a short language code to the display name that is interpolated into the
# model instructions and returned to the client as the "language" label.
SUPPORTED_LANGUAGES = {
"ko": "한국어 (Korean)",
"en": "English",
"es": "Español (Spanish)",
"fr": "Français (French)",
"de": "Deutsch (German)",
"it": "Italiano (Italian)",
"pt": "Português (Portuguese)",
"ru": "Русский (Russian)",
"ja": "日本語 (Japanese)",
"zh": "中文 (Chinese)",
"ar": "العربية (Arabic)",
"hi": "हिन्दी (Hindi)",
"nl": "Nederlands (Dutch)",
"pl": "Polski (Polish)",
"tr": "Türkçe (Turkish)",
"vi": "Tiếng Việt (Vietnamese)",
"th": "ไทย (Thai)",
"id": "Bahasa Indonesia",
"sv": "Svenska (Swedish)",
"da": "Dansk (Danish)",
"no": "Norsk (Norwegian)",
"fi": "Suomi (Finnish)",
"he": "עברית (Hebrew)",
"uk": "Українська (Ukrainian)",
"cs": "Čeština (Czech)",
"el": "Ελληνικά (Greek)",
"ro": "Română (Romanian)",
"hu": "Magyar (Hungarian)",
"ms": "Bahasa Melayu (Malay)"
}
# Page body served by the index route, embedded as a string. index() returns it
# after replacing the "__RTC_CONFIGURATION__" placeholder with a JSON value.
# NOTE(review): neither that placeholder nor any markup is visible in the text
# below — confirm the original HTML was not stripped from this file.
HTML_CONTENT = """
Mouth of 'MOUSE'
MOUSE 음성 챗
연결 대기 중
설정
웹 검색
자동 번역
자동 통역
통역 언어
통역 모드: 입력한 음성이 선택한 언어로 자동 통역됩니다.
대화
"""
class BraveSearchClient:
    """Small asynchronous wrapper around the Brave web-search endpoint."""

    def __init__(self, api_key: str):
        """Store the subscription token and the endpoint URL."""
        self.base_url = "https://api.search.brave.com/res/v1/web/search"
        self.api_key = api_key

    async def search(self, query: str, count: int = 10) -> List[Dict]:
        """Run a web search and return simplified result records.

        Args:
            query: Search terms, sent as the ``q`` parameter.
            count: Requested number of results; also caps how many are kept.

        Returns:
            A list of dicts with ``title``, ``url`` and ``description`` keys.
            The list is empty when no API key is set, when the response has no
            ``web.results`` section, or when the request/parsing raises.
        """
        if not self.api_key:
            return []

        request_headers = {
            "Accept": "application/json",
            "X-Subscription-Token": self.api_key
        }
        query_params = {
            "q": query,
            "count": count,
            "lang": "ko"
        }

        async with httpx.AsyncClient() as http:
            try:
                reply = await http.get(
                    self.base_url, headers=request_headers, params=query_params
                )
                reply.raise_for_status()
                payload = reply.json()
                if "web" not in payload or "results" not in payload["web"]:
                    return []
                return [
                    {
                        "title": hit.get("title", ""),
                        "url": hit.get("url", ""),
                        "description": hit.get("description", "")
                    }
                    for hit in payload["web"]["results"][:count]
                ]
            except Exception as e:
                # Best effort: any failure is logged and reported as "no results".
                print(f"Brave Search error: {e}")
                return []
# Module-wide Brave search client, built from the BSEARCH_API environment
# variable. It stays None when the variable is unset or empty; the code that
# uses it checks for that before searching.
brave_api_key = os.getenv("BSEARCH_API")
search_client = BraveSearchClient(brave_api_key) if brave_api_key else None
print(f"Search client initialized: {search_client is not None}, API key present: {bool(brave_api_key)}")
# Per-connection settings keyed by webrtc_id. Entries are written by the
# /webrtc/offer endpoint and read by OpenAIHandler (most recent timestamp wins).
connection_settings = {}
# Shared async OpenAI client used by process_text_chat for chat completions.
client = openai.AsyncOpenAI()
def update_chatbot(chatbot: list[dict], response: "ResponseAudioTranscriptDoneEvent | dict"):
    """Append an assistant transcript to the chatbot history.

    OpenAIHandler does not emit bare transcript events: it wraps them in a dict
    of the form ``{"event": <transcript event>, "language": ...}`` and also
    emits search notifications shaped like ``{"type": "search", "query": ...}``.
    Reading ``response.transcript`` directly therefore fails on those dicts.

    Args:
        chatbot: Message history in ``{"role", "content"}`` form; mutated in place.
        response: Either a transcript event object or one of the dict payloads
            described above.

    Returns:
        The same ``chatbot`` list, extended by one assistant message when the
        payload carries a transcript and left unchanged otherwise.
    """
    # Unwrap the handler's dict envelope; plain event objects pass through.
    event = response.get("event") if isinstance(response, dict) else response
    transcript = getattr(event, "transcript", None)
    if transcript is not None:
        chatbot.append({"role": "assistant", "content": transcript})
    return chatbot
def get_translation_instructions(target_language: str) -> str:
    """Build the prompt suffix that forces replies into ``target_language``.

    Args:
        target_language: Language code; looked up in SUPPORTED_LANGUAGES and
            used verbatim as the display name when it is not listed there.

    Returns:
        The instruction text (starting with two newlines), or an empty string
        when ``target_language`` is falsy.
    """
    if target_language:
        display_name = SUPPORTED_LANGUAGES.get(target_language, target_language)
        respond_rule = f"\n\nIMPORTANT: You must respond in {display_name} ({target_language}). "
        translate_rule = f"Translate all your responses to {display_name}."
        return respond_rule + translate_rule
    return ""
async def process_text_chat(message: str, web_search_enabled: bool, target_language: str,
                            system_prompt: str) -> Dict[str, str]:
    """Answer a text chat message with the gpt-4o-mini chat model.

    When web search is enabled, a search client exists and the message contains
    one of the trigger keywords, the top five search results are added as an
    extra system message before the user message.

    Args:
        message: The user's text.
        web_search_enabled: Whether keyword-triggered web search may be used.
        target_language: Language code for the reply; empty for no constraint.
        system_prompt: Custom system prompt; a generic one is used when empty.

    Returns:
        ``{"response": ..., "language": ...}`` on success, or
        ``{"error": <exception text>}`` if anything raises.
    """
    try:
        persona = system_prompt or "You are a helpful assistant."
        messages = [
            {"role": "system", "content": persona + get_translation_instructions(target_language)}
        ]

        if web_search_enabled and search_client:
            # Substrings that mark the message as needing fresh information.
            search_keywords = ["날씨", "기온", "비", "눈", "뉴스", "소식", "현재", "최근",
                               "오늘", "지금", "가격", "환율", "주가", "weather", "news",
                               "current", "today", "price", "2024", "2025"]
            lowered = message.lower()
            should_search = any(keyword in lowered for keyword in search_keywords)

            if should_search:
                search_results = await search_client.search(message)
                if search_results:
                    search_context = "웹 검색 결과:\n\n" + "".join(
                        f"{rank}. {hit['title']}\n{hit['description']}\n\n"
                        for rank, hit in enumerate(search_results[:5], 1)
                    )
                    messages.append({
                        "role": "system",
                        "content": f"다음 웹 검색 결과를 참고하여 답변하세요:\n\n{search_context}"
                    })

        messages.append({"role": "user", "content": message})

        completion = await client.chat.completions.create(
            model="gpt-4o-mini",
            messages=messages,
            temperature=0.7,
            max_tokens=2000
        )

        reply_text = completion.choices[0].message.content
        language_label = SUPPORTED_LANGUAGES.get(target_language, "") if target_language else ""
        return {"response": reply_text, "language": language_label}
    except Exception as e:
        print(f"Error in text chat: {e}")
        return {"error": str(e)}
class OpenAIHandler(AsyncStreamHandler):
    """Stream handler that relays audio between a client and the OpenAI Realtime API.

    Incoming frames are forwarded to the realtime connection by ``receive``;
    audio chunks and transcript/search notifications produced by ``start_up``
    are queued on ``output_queue`` and handed out by ``emit``.

    The handler works in one of two modes:

    * normal mode - assistant behaviour, optionally with a ``web_search`` tool
      and a forced reply language;
    * interpretation mode - the model only translates what the user says into
      ``interpretation_language``; no tools are offered.
    """

    def __init__(self, web_search_enabled: bool = False, target_language: str = "",
                 system_prompt: str = "", webrtc_id: Optional[str] = None,
                 interpretation_mode: bool = False, interpretation_language: str = "") -> None:
        """Configure audio parameters and remember the per-connection settings.

        Args:
            web_search_enabled: Offer the ``web_search`` tool in normal mode.
            target_language: Language code the assistant must reply in.
            system_prompt: Custom base instructions; a generic prompt is used
                when empty.
            webrtc_id: Identifier of the connection these settings belong to.
            interpretation_mode: Run as a pure interpreter instead of an assistant.
            interpretation_language: Target language code for interpretation.
        """
        super().__init__(
            expected_layout="mono",
            output_sample_rate=SAMPLE_RATE,
            output_frame_size=480,
            input_sample_rate=SAMPLE_RATE,
        )
        self.connection = None
        self.output_queue = asyncio.Queue()
        self.search_client = search_client
        # State of the function call currently being streamed by the model.
        self.function_call_in_progress = False
        self.current_function_args = ""
        self.current_call_id = None
        self.webrtc_id = webrtc_id
        self.web_search_enabled = web_search_enabled
        self.target_language = target_language
        self.system_prompt = system_prompt
        self.interpretation_mode = interpretation_mode
        self.interpretation_language = interpretation_language
        print(f"Handler created with web_search_enabled={web_search_enabled}, "
              f"target_language={target_language}, webrtc_id={webrtc_id}, "
              f"interpretation_mode={interpretation_mode}, interpretation_language={interpretation_language}")

    @staticmethod
    def _latest_settings():
        """Return ``(webrtc_id, settings)`` of the newest stored offer, or ``None``."""
        if not connection_settings:
            return None
        newest_id = max(
            connection_settings,
            key=lambda k: connection_settings[k].get('timestamp', 0),
        )
        return newest_id, connection_settings[newest_id]

    def _output_language_label(self) -> str:
        """Return the display name of the active output language ("" if none)."""
        if not (self.interpretation_language or self.target_language):
            return ""
        active_code = self.interpretation_language if self.interpretation_mode else self.target_language
        return SUPPORTED_LANGUAGES.get(active_code, "")

    def copy(self):
        """Create a new handler configured from the most recently stored settings.

        Falls back to a handler with default settings when nothing has been
        stored in ``connection_settings`` yet.
        """
        latest = self._latest_settings()
        if latest is not None:
            recent_id, settings = latest
            return OpenAIHandler(
                web_search_enabled=settings.get('web_search_enabled', False),
                target_language=settings.get('target_language', ''),
                system_prompt=settings.get('system_prompt', ''),
                webrtc_id=recent_id,
                interpretation_mode=settings.get('interpretation_mode', False),
                interpretation_language=settings.get('interpretation_language', '')
            )
        print(f"Handler.copy() called - creating new handler with default settings")
        return OpenAIHandler(web_search_enabled=False)

    async def search_web(self, query: str) -> str:
        """Perform a web search and return the results as formatted text.

        Returns a (Korean) explanatory message instead of results when search
        is disabled or nothing was found.
        """
        if not self.search_client or not self.web_search_enabled:
            return "웹 검색이 비활성화되어 있습니다."
        print(f"Searching web for: {query}")
        results = await self.search_client.search(query)
        if not results:
            return f"'{query}'에 대한 검색 결과를 찾을 수 없습니다."
        formatted_results = [
            f"{i}. {result['title']}\n"
            f"   URL: {result['url']}\n"
            f"   {result['description']}\n"
            for i, result in enumerate(results, 1)
        ]
        return f"웹 검색 결과 '{query}':\n\n" + "\n".join(formatted_results)

    async def process_text_message(self, message: str):
        """Send a typed user message into the realtime conversation and request a reply."""
        if self.connection:
            await self.connection.conversation.item.create(
                item={
                    "type": "message",
                    "role": "user",
                    "content": [{"type": "input_text", "text": message}]
                }
            )
            await self.connection.response.create()

    def get_interpretation_instructions(self):
        """Return the strict translator rules for interpretation mode.

        Empty when interpretation mode is off or no language is selected.
        """
        if not self.interpretation_mode or not self.interpretation_language:
            return ""
        target_language_name = SUPPORTED_LANGUAGES.get(self.interpretation_language, self.interpretation_language)
        target_code = self.interpretation_language
        # Sample translations of the Korean example sentence used in the prompt;
        # languages without an entry fall back to the English sample.
        examples = {
            "en": "Hello, the weather is nice today",
            "es": "Hola, el clima está agradable hoy",
            "fr": "Bonjour, il fait beau aujourd'hui",
            "de": "Hallo, das Wetter ist heute schön",
            "ja": "こんにちは、今日はいい天気ですね",
            "zh": "你好,今天天气很好"
        }
        example_translation = examples.get(target_code, "Hello, the weather is nice today")
        return (
            f"INTERPRETATION MODE - CRITICAL RULES:\n\n"
            f"1. You are ONLY a translator to {target_language_name} (language code: {target_code}).\n"
            f"2. NEVER respond in any other language.\n"
            f"3. NEVER generate conversation or additional content.\n"
            f"4. ONLY translate what the user says.\n"
            f"5. STOP immediately after translating.\n\n"
            f"Example:\n"
            f"If user says: '안녕하세요, 오늘 날씨가 좋네요'\n"
            f"You MUST respond ONLY: '{example_translation}'\n\n"
            f"DO NOT say anything else. DO NOT continue talking.\n"
            f"Your output language MUST be {target_language_name} ONLY."
        )

    def get_translation_instructions(self):
        """Return the reply-language instructions for normal mode.

        Empty when no target language is set or interpretation mode is active.
        """
        if not self.target_language or self.interpretation_mode:
            return ""
        language_name = SUPPORTED_LANGUAGES.get(self.target_language, self.target_language)
        return (
            f"\n\nIMPORTANT: You must respond in {language_name} ({self.target_language}). "
            f"Translate all your responses to {language_name}. "
            f"This includes both spoken and written responses."
        )

    def _build_instructions_and_tools(self):
        """Return the ``(instructions, tools)`` pair for the current mode."""
        if self.interpretation_mode:
            # Interpretation mode overrides any custom system prompt and offers no tools.
            base_instructions = (
                f"You are a professional interpreter. Your ONLY task is to translate what the user says "
                f"into {SUPPORTED_LANGUAGES.get(self.interpretation_language, self.interpretation_language)}. "
                f"Do not add any commentary, do not continue the conversation, do not generate new content. "
                f"Simply translate what was said and stop."
            )
            instructions = base_instructions + "\n\n" + self.get_interpretation_instructions()
            print(f"Interpretation mode active - target language: {self.interpretation_language}")
            return instructions, []

        base_instructions = self.system_prompt or "You are a helpful assistant."
        translation_instructions = self.get_translation_instructions()
        if not (self.web_search_enabled and self.search_client):
            return base_instructions + translation_instructions, []

        # Realtime API tool definitions are flat: name/description/parameters sit
        # next to "type", not inside a nested "function" object as in the
        # Chat Completions API.
        tools = [{
            "type": "function",
            "name": "web_search",
            "description": "Search the web for current information. Use this for weather, news, prices, current events, or any time-sensitive topics.",
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": "The search query"
                    }
                },
                "required": ["query"]
            }
        }]
        print("Web search function added to tools")
        search_instructions = (
            "\n\nYou have web search capabilities. "
            "IMPORTANT: You MUST use the web_search function for ANY of these topics:\n"
            "- Weather (날씨, 기온, 비, 눈)\n"
            "- News (뉴스, 소식)\n"
            "- Current events (현재, 최근, 오늘, 지금)\n"
            "- Prices (가격, 환율, 주가)\n"
            "- Sports scores or results\n"
            "- Any question about 2024 or 2025\n"
            "- Any time-sensitive information\n\n"
            "When in doubt, USE web_search. It's better to search and provide accurate information "
            "than to guess or use outdated information."
        )
        return base_instructions + search_instructions + translation_instructions, tools

    def _build_session_update(self, instructions: str, tools: list) -> dict:
        """Assemble the ``session.update`` payload for the realtime connection."""
        session_update = {
            "turn_detection": {
                "type": "server_vad",
                "threshold": 0.5,
                "prefix_padding_ms": 300,
                # Shorter silence window in interpretation mode.
                "silence_duration_ms": 500 if self.interpretation_mode else 700
            },
            "instructions": instructions,
            "tools": tools,
            "tool_choice": "auto" if tools else "none"
        }
        voice_language = self.interpretation_language if self.interpretation_mode else self.target_language
        if voice_language:
            # A single voice is used for every language; the output language is
            # steered through the instructions.
            session_update["voice"] = "alloy"
            if self.interpretation_mode:
                session_update["output_audio_format"] = "pcm16"
            print(f"Voice set to: alloy for language: {voice_language}")
        if self.interpretation_mode and self.interpretation_language:
            session_update["modalities"] = ["text", "audio"]
            # 0.6 is the lowest temperature the Realtime API accepts (documented
            # range 0.6-1.2); a smaller value would be rejected.
            session_update["temperature"] = 0.6
            session_update["max_response_output_tokens"] = 500  # keep translations short
            print(f"Interpretation session config: voice={session_update.get('voice')}, lang={self.interpretation_language}")
        return session_update

    async def _handle_function_call(self, raw_arguments: str, call_id: Optional[str]) -> None:
        """Run a completed ``web_search`` call and send its output back to the model."""
        print(f"Function call done, args: {raw_arguments}")
        try:
            args = json.loads(raw_arguments)
            query = args.get("query", "")
            # Tell the client a search is happening.
            await self.output_queue.put(AdditionalOutputs({
                "type": "search",
                "query": query
            }))
            search_results = await self.search_web(query)
            print(f"Search results length: {len(search_results)}")
            if self.connection and call_id:
                await self.connection.conversation.item.create(
                    item={
                        "type": "function_call_output",
                        "call_id": call_id,
                        "output": search_results
                    }
                )
                await self.connection.response.create()
        except Exception as e:
            # A failed tool call must not terminate the event loop of the stream.
            print(f"Function call error: {e}")
        finally:
            self.function_call_in_progress = False
            self.current_function_args = ""
            self.current_call_id = None

    async def start_up(self):
        """Connect to the Realtime API, configure the session and pump its events.

        Settings are first refreshed from the most recent entry in
        ``connection_settings``. The method then runs until the connection's
        event stream ends, queueing audio, transcripts and search notifications
        on ``output_queue``.
        """
        latest = self._latest_settings()
        if latest is not None:
            recent_id, settings = latest
            self.web_search_enabled = settings.get('web_search_enabled', False)
            self.target_language = settings.get('target_language', '')
            self.system_prompt = settings.get('system_prompt', '')
            self.interpretation_mode = settings.get('interpretation_mode', False)
            self.interpretation_language = settings.get('interpretation_language', '')
            self.webrtc_id = recent_id
            print(f"start_up: Updated settings from storage - webrtc_id={self.webrtc_id}, "
                  f"web_search_enabled={self.web_search_enabled}, target_language={self.target_language}, "
                  f"interpretation_mode={self.interpretation_mode}")
            print(f"Handler interpretation settings: mode={self.interpretation_mode}, language={self.interpretation_language}")
        print(f"Starting up handler with web_search_enabled={self.web_search_enabled}, "
              f"target_language={self.target_language}, interpretation_mode={self.interpretation_mode}, "
              f"interpretation_language={self.interpretation_language}")
        self.client = openai.AsyncOpenAI()

        instructions, tools = self._build_instructions_and_tools()

        async with self.client.beta.realtime.connect(
            model="gpt-4o-mini-realtime-preview-2024-12-17"
        ) as conn:
            session_update = self._build_session_update(instructions, tools)
            await conn.session.update(session=session_update)
            self.connection = conn
            print(f"Connected with tools: {len(tools)} functions, voice: {session_update.get('voice', 'default')}, "
                  f"interpretation_mode: {self.interpretation_mode}, language: {self.interpretation_language if self.interpretation_mode else self.target_language}")

            async for event in conn:
                # Debug logging for function calls
                if event.type.startswith("response.function_call"):
                    print(f"Function event: {event.type}")

                if event.type == "response.audio_transcript.done":
                    await self.output_queue.put(AdditionalOutputs({
                        "event": event,
                        "language": self._output_language_label()
                    }))
                elif event.type == "response.audio.delta":
                    await self.output_queue.put(
                        (
                            self.output_sample_rate,
                            np.frombuffer(
                                base64.b64decode(event.delta), dtype=np.int16
                            ).reshape(1, -1),
                        ),
                    )
                elif event.type == "error":
                    # Surface API-side failures (e.g. a rejected session update).
                    print(f"Realtime API error: {getattr(event, 'error', event)}")
                # Function calls are only handled outside interpretation mode.
                # The API streams arguments through ".delta" events and delivers
                # the final payload on ".done"; there is no separate start event,
                # so the first delta opens the call.
                elif event.type == "response.function_call_arguments.delta" and not self.interpretation_mode:
                    if not self.function_call_in_progress:
                        print(f"Function call started")
                        self.function_call_in_progress = True
                        self.current_function_args = ""
                    self.current_call_id = getattr(event, 'call_id', None) or self.current_call_id
                    self.current_function_args += event.delta
                elif event.type == "response.function_call_arguments.done" and not self.interpretation_mode:
                    # Prefer the complete values carried by the done event; the
                    # accumulated deltas are only a fallback.
                    raw_arguments = getattr(event, 'arguments', None) or self.current_function_args
                    call_id = getattr(event, 'call_id', None) or self.current_call_id
                    await self._handle_function_call(raw_arguments, call_id)

    async def receive(self, frame: tuple[int, np.ndarray]) -> None:
        """Forward one incoming audio frame to the realtime input buffer.

        Frames are dropped while no connection exists; send errors are logged
        and ignored because the connection may already be closed.
        """
        if not self.connection:
            return
        try:
            _, array = frame
            array = array.squeeze()
            audio_message = base64.b64encode(array.tobytes()).decode("utf-8")
            await self.connection.input_audio_buffer.append(audio=audio_message)
        except Exception as e:
            print(f"Error in receive: {e}")
            # Connection might be closed, ignore the error

    async def emit(self) -> tuple[int, np.ndarray] | AdditionalOutputs | None:
        """Return the next queued output item.

        Dict items of type ``text_message`` (queued by the text-message
        endpoint) are not returned: their content is sent to the model and
        ``None`` is returned instead.
        """
        item = await wait_for_item(self.output_queue)
        if isinstance(item, dict) and item.get('type') == 'text_message':
            await self.process_text_message(item['content'])
            return None
        return item

    async def shutdown(self) -> None:
        """Close the realtime connection, if one is open."""
        if self.connection:
            await self.connection.close()
            self.connection = None
# Template handler given to the Stream below; it starts with web search and
# interpretation mode disabled.
handler = OpenAIHandler(web_search_enabled=False, interpretation_mode=False)
# Chat history component, used as both additional input and additional output.
chatbot = gr.Chatbot(type="messages")
# Bidirectional audio stream. TURN credentials, the concurrency limit and the
# time limit are only set when get_space() is truthy; otherwise None is passed.
stream = Stream(
handler, # Pass instance, not factory
mode="send-receive",
modality="audio",
additional_inputs=[chatbot],
additional_outputs=[chatbot],
additional_outputs_handler=update_chatbot,
rtc_configuration=get_twilio_turn_credentials() if get_space() else None,
concurrency_limit=5 if get_space() else None,
time_limit=300 if get_space() else None,
)
app = FastAPI()
# Register the stream's own routes on the FastAPI app. This happens before the
# custom routes below are declared.
stream.mount(app)
# Intercept offer to capture settings
@app.post("/webrtc/offer", include_in_schema=False)
async def custom_offer(request: Request):
    """Record the per-connection settings from a WebRTC offer, then forward it.

    The settings found in the request body are stored in ``connection_settings``
    under the offer's ``webrtc_id`` (with a timestamp so the newest entry can
    be found), and the untouched body is passed on to ``stream.offer``.

    Returns:
        Whatever ``stream.offer`` returns.
    """
    body = await request.json()
    webrtc_id = body.get("webrtc_id")
    web_search_enabled = body.get("web_search_enabled", False)
    target_language = body.get("target_language", "")
    system_prompt = body.get("system_prompt", "")
    interpretation_mode = body.get("interpretation_mode", False)
    interpretation_language = body.get("interpretation_language", "")
    print(f"Custom offer - webrtc_id: {webrtc_id}, web_search_enabled: {web_search_enabled}, "
          f"target_language: {target_language}, interpretation_mode: {interpretation_mode}, "
          f"interpretation_language: {interpretation_language}")

    # Store settings with timestamp
    if webrtc_id:
        connection_settings[webrtc_id] = {
            'web_search_enabled': web_search_enabled,
            'target_language': target_language,
            'system_prompt': system_prompt,
            'interpretation_mode': interpretation_mode,
            'interpretation_language': interpretation_language,
            'timestamp': asyncio.get_running_loop().time()
        }

    # Remove our custom route temporarily.
    # NOTE(review): stream.offer is called directly rather than through the
    # router, so this removal may be unnecessary — confirm before simplifying.
    custom_route = None
    for i, route in enumerate(app.routes):
        if (getattr(route, 'path', None) == "/webrtc/offer"
                and getattr(route, 'endpoint', None) == custom_offer):
            custom_route = app.routes.pop(i)
            break

    try:
        # Forward to stream's offer handler
        return await stream.offer(body)
    finally:
        # Always re-add the route, even when the offer handler raises;
        # otherwise a single failed offer would drop this endpoint for good.
        if custom_route:
            app.routes.insert(0, custom_route)
@app.post("/chat/text")
async def chat_text(request: Request):
    """Answer a text chat request.

    Reads ``message``, ``web_search_enabled``, ``target_language`` and
    ``system_prompt`` from the JSON body and delegates to ``process_text_chat``.
    An empty message, or any exception, produces an ``{"error": ...}`` dict.
    """
    try:
        payload = await request.json()
        message = payload.get("message", "")
        web_search_enabled = payload.get("web_search_enabled", False)
        target_language = payload.get("target_language", "")
        system_prompt = payload.get("system_prompt", "")

        if message:
            return await process_text_chat(
                message, web_search_enabled, target_language, system_prompt
            )
        return {"error": "메시지가 비어있습니다."}
    except Exception as e:
        print(f"Error in chat_text endpoint: {e}")
        return {"error": "채팅 처리 중 오류가 발생했습니다."}
@app.post("/text_message/{webrtc_id}")
async def receive_text_message(webrtc_id: str, request: Request):
    """Queue a typed message for the handler of the given connection.

    The body's ``content`` field is placed on the handler's output queue as a
    ``text_message`` item. The reply is ``{"status": "ok"}`` whether or not a
    handler is registered for ``webrtc_id``.
    """
    payload = await request.json()
    text = payload.get("content", "")

    if webrtc_id in stream.handlers:
        target_handler = stream.handlers[webrtc_id]
        queued_item = {
            'type': 'text_message',
            'content': text
        }
        await target_handler.output_queue.put(queued_item)

    return {"status": "ok"}
@app.get("/outputs")
async def outputs(webrtc_id: str):
    """Stream a connection's outputs to the client as server-sent events.

    Search notifications are sent as ``search`` events; transcript payloads are
    sent as ``output`` events with role, content and language. Everything else
    is skipped.
    """
    async def output_stream():
        async for output in stream.output_stream(webrtc_id):
            if not (hasattr(output, 'args') and output.args):
                continue
            payload = output.args[0]
            if not isinstance(payload, dict):
                continue

            if payload.get('type') == 'search':
                yield f"event: search\ndata: {json.dumps(payload)}\n\n"
            elif 'event' in payload:
                # Transcript event wrapped together with its language label.
                event = payload['event']
                if hasattr(event, 'transcript'):
                    data = {
                        "role": "assistant",
                        "content": event.transcript,
                        "language": payload.get('language', '')
                    }
                    yield f"event: output\ndata: {json.dumps(data)}\n\n"

    return StreamingResponse(output_stream(), media_type="text/event-stream")
@app.get("/")
async def index():
    """Serve the embedded HTML page with the RTC configuration filled in.

    The configuration is the Twilio TURN credentials when ``get_space()`` is
    truthy and ``None`` (serialized as JSON ``null``) otherwise.
    """
    rtc_config = None
    if get_space():
        rtc_config = get_twilio_turn_credentials()
    page = HTML_CONTENT.replace("__RTC_CONFIGURATION__", json.dumps(rtc_config))
    return HTMLResponse(content=page)
if __name__ == "__main__":
    import uvicorn

    # MODE selects the front end: "UI" launches the stream's built-in UI,
    # "PHONE" starts the phone interface, anything else serves the FastAPI app.
    mode = os.getenv("MODE")
    if mode not in ("UI", "PHONE"):
        uvicorn.run(app, host="0.0.0.0", port=7860)
    elif mode == "UI":
        stream.ui.launch(server_port=7860)
    else:
        stream.fastphone(host="0.0.0.0", port=7860)