seawolf2357 committed
Commit 64bea29 · verified · 1 Parent(s): 0aafe09

Create app-backup2.py

Files changed (1):
  app-backup2.py +2163 -0
app-backup2.py ADDED
@@ -0,0 +1,2163 @@
+ import asyncio
+ import base64
+ import json
+ from pathlib import Path
+ import os
+ import numpy as np
+ import openai
+ from dotenv import load_dotenv
+ from fastapi import FastAPI, Request
+ from fastapi.responses import HTMLResponse, StreamingResponse
+ from fastrtc import (
+     AdditionalOutputs,
+     AsyncStreamHandler,
+     Stream,
+     get_twilio_turn_credentials,
+     wait_for_item,
+ )
+ from gradio.utils import get_space
+ from openai.types.beta.realtime import ResponseAudioTranscriptDoneEvent
+ import httpx
+ from typing import Optional, List, Dict
+ import gradio as gr
+ import io
+ from scipy import signal
+ import wave
+
+ load_dotenv()
+
+ SAMPLE_RATE = 24000
+
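+ # 24 kHz mono PCM is used throughout: the handler below is configured with this
+ # rate for both mic input and speaker output, and (per its own comment) the TTS
+ # step also returns 24 kHz PCM.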
+ # Supported languages for OpenAI Realtime API
+ SUPPORTED_LANGUAGES = {
+     "ko": "한국어 (Korean)",
+     "en": "English",
+     "es": "Español (Spanish)",
+     "fr": "Français (French)",
+     "de": "Deutsch (German)",
+     "it": "Italiano (Italian)",
+     "pt": "Português (Portuguese)",
+     "ru": "Русский (Russian)",
+     "ja": "日本語 (Japanese)",
+     "zh": "中文 (Chinese)",
+     "ar": "العربية (Arabic)",
+     "hi": "हिन्दी (Hindi)",
+     "nl": "Nederlands (Dutch)",
+     "pl": "Polski (Polish)",
+     "tr": "Türkçe (Turkish)",
+     "vi": "Tiếng Việt (Vietnamese)",
+     "th": "ไทย (Thai)",
+     "id": "Bahasa Indonesia",
+     "sv": "Svenska (Swedish)",
+     "da": "Dansk (Danish)",
+     "no": "Norsk (Norwegian)",
+     "fi": "Suomi (Finnish)",
+     "he": "עברית (Hebrew)",
+     "uk": "Українська (Ukrainian)",
+     "cs": "Čeština (Czech)",
+     "el": "Ελληνικά (Greek)",
+     "ro": "Română (Romanian)",
+     "hu": "Magyar (Hungarian)",
+     "ms": "Bahasa Melayu (Malay)"
+ }
+
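+ # The keys are ISO 639-1 codes; the embedded HTML below reuses them as <option>
+ # values, and the interpretation path feeds them into Whisper/GPT prompts.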
+ # HTML content embedded as a string
+ HTML_CONTENT = """<!DOCTYPE html>
+ <html lang="ko">
+
+ <head>
+     <meta charset="UTF-8">
+     <meta name="viewport" content="width=device-width, initial-scale=1.0">
+     <title>Mouth of 'MOUSE'</title>
+     <style>
+         :root {
+             --primary-color: #6f42c1;
+             --secondary-color: #563d7c;
+             --dark-bg: #121212;
+             --card-bg: #1e1e1e;
+             --text-color: #f8f9fa;
+             --border-color: #333;
+             --hover-color: #8a5cf6;
+         }
+         body {
+             font-family: "SF Pro Display", -apple-system, BlinkMacSystemFont, sans-serif;
+             background-color: var(--dark-bg);
+             color: var(--text-color);
+             margin: 0;
+             padding: 0;
+             height: 100vh;
+             display: flex;
+             flex-direction: column;
+             overflow: hidden;
+         }
+         .container {
+             max-width: 1400px;
+             margin: 0 auto;
+             padding: 20px;
+             flex-grow: 1;
+             display: flex;
+             flex-direction: column;
+             width: 100%;
+             height: 100vh;
+             box-sizing: border-box;
+             overflow: hidden;
+         }
+         .header {
+             text-align: center;
+             padding: 15px 0;
+             border-bottom: 1px solid var(--border-color);
+             margin-bottom: 20px;
+             flex-shrink: 0;
+             background-color: var(--card-bg);
+         }
+         .main-content {
+             display: flex;
+             gap: 20px;
+             flex-grow: 1;
+             min-height: 0;
+             overflow: hidden;
+         }
+         .sidebar {
+             width: 350px;
+             flex-shrink: 0;
+             display: flex;
+             flex-direction: column;
+             gap: 20px;
+             overflow-y: auto;
+             max-height: calc(100vh - 120px);
+         }
+         .chat-section {
+             flex-grow: 1;
+             display: flex;
+             flex-direction: column;
+             min-width: 0;
+         }
+         .logo {
+             display: flex;
+             align-items: center;
+             justify-content: center;
+             gap: 10px;
+         }
+         .logo h1 {
+             margin: 0;
+             background: linear-gradient(135deg, var(--primary-color), #a78bfa);
+             -webkit-background-clip: text;
+             background-clip: text;
+             color: transparent;
+             font-size: 32px;
+             letter-spacing: 1px;
+         }
+         /* Settings section */
+         .settings-section {
+             background-color: var(--card-bg);
+             border-radius: 12px;
+             padding: 20px;
+             border: 1px solid var(--border-color);
+             overflow-y: auto;
+             flex-grow: 1;
+         }
+         .settings-grid {
+             display: flex;
+             flex-direction: column;
+             gap: 15px;
+             margin-bottom: 15px;
+         }
+         .interpretation-section {
+             display: flex;
+             flex-direction: column;
+             gap: 15px;
+             padding: 15px;
+             background-color: var(--dark-bg);
+             border-radius: 8px;
+             margin-top: 15px;
+         }
+         .interpretation-info {
+             font-size: 13px;
+             color: #999;
+             margin-top: 5px;
+         }
+         .setting-item {
+             display: flex;
+             align-items: center;
+             justify-content: space-between;
+             gap: 10px;
+         }
+         .setting-label {
+             font-size: 14px;
+             color: #aaa;
+             min-width: 60px;
+         }
+         /* Toggle switch */
+         .toggle-switch {
+             position: relative;
+             width: 50px;
+             height: 26px;
+             background-color: #ccc;
+             border-radius: 13px;
+             cursor: pointer;
+             transition: background-color 0.3s;
+         }
+         .toggle-switch.active {
+             background-color: var(--primary-color);
+         }
+         .toggle-slider {
+             position: absolute;
+             top: 3px;
+             left: 3px;
+             width: 20px;
+             height: 20px;
+             background-color: white;
+             border-radius: 50%;
+             transition: transform 0.3s;
+         }
+         .toggle-switch.active .toggle-slider {
+             transform: translateX(24px);
+         }
+         /* Select dropdown */
+         select {
+             background-color: var(--card-bg);
+             color: var(--text-color);
+             border: 1px solid var(--border-color);
+             padding: 8px 12px;
+             border-radius: 6px;
+             font-size: 14px;
+             cursor: pointer;
+             min-width: 120px;
+             max-width: 200px;
+         }
+         select:focus {
+             outline: none;
+             border-color: var(--primary-color);
+         }
+         /* Text inputs */
+         .text-input-section {
+             margin-top: 15px;
+         }
+         input[type="text"], textarea {
+             width: 100%;
+             background-color: var(--dark-bg);
+             color: var(--text-color);
+             border: 1px solid var(--border-color);
+             padding: 10px;
+             border-radius: 6px;
+             font-size: 14px;
+             box-sizing: border-box;
+             margin-top: 5px;
+         }
+         input[type="text"]:focus, textarea:focus {
+             outline: none;
+             border-color: var(--primary-color);
+         }
+         textarea {
+             resize: vertical;
+             min-height: 80px;
+         }
+         .chat-container {
+             border-radius: 12px;
+             background-color: var(--card-bg);
+             box-shadow: 0 8px 32px rgba(0, 0, 0, 0.2);
+             padding: 20px;
+             flex-grow: 1;
+             display: flex;
+             flex-direction: column;
+             border: 1px solid var(--border-color);
+             overflow: hidden;
+             min-height: 0;
+             height: 100%;
+         }
+         .chat-messages {
+             flex-grow: 1;
+             overflow-y: auto;
+             padding: 15px;
+             scrollbar-width: thin;
+             scrollbar-color: var(--primary-color) var(--card-bg);
+             min-height: 0;
+             max-height: calc(100vh - 250px);
+         }
+         .chat-messages::-webkit-scrollbar {
+             width: 6px;
+         }
+         .chat-messages::-webkit-scrollbar-thumb {
+             background-color: var(--primary-color);
+             border-radius: 6px;
+         }
+         .message {
+             margin-bottom: 15px;
+             padding: 12px 16px;
+             border-radius: 8px;
+             font-size: 15px;
+             line-height: 1.5;
+             position: relative;
+             max-width: 85%;
+             animation: fade-in 0.3s ease-out;
+             word-wrap: break-word;
+         }
+         @keyframes fade-in {
+             from {
+                 opacity: 0;
+                 transform: translateY(10px);
+             }
+             to {
+                 opacity: 1;
+                 transform: translateY(0);
+             }
+         }
+         .message.user {
+             background: linear-gradient(135deg, #2c3e50, #34495e);
+             margin-left: auto;
+             border-bottom-right-radius: 2px;
+         }
+         .message.assistant {
+             background: linear-gradient(135deg, var(--secondary-color), var(--primary-color));
+             margin-right: auto;
+             border-bottom-left-radius: 2px;
+         }
+         .message.search-result {
+             background: linear-gradient(135deg, #1a5a3e, #2e7d32);
+             font-size: 14px;
+             padding: 10px;
+             margin-bottom: 10px;
+         }
+         .message.assistant.interpretation {
+             background: linear-gradient(135deg, #1a5a3e, #2e7d32);
+             font-style: italic;
+         }
+         .interpretation-arrow {
+             color: #4caf50;
+             font-weight: bold;
+             margin: 0 10px;
+         }
+         .controls {
+             text-align: center;
+             margin-top: auto;
+             display: flex;
+             justify-content: center;
+             gap: 10px;
+             flex-shrink: 0;
+             padding-top: 20px;
+         }
+         /* Responsive design */
+         @media (max-width: 1024px) {
+             .sidebar {
+                 width: 300px;
+             }
+         }
+         @media (max-width: 768px) {
+             .main-content {
+                 flex-direction: column;
+             }
+             .sidebar {
+                 width: 100%;
+                 margin-bottom: 20px;
+             }
+             .chat-section {
+                 height: 400px;
+             }
+         }
+         button {
+             background: linear-gradient(135deg, var(--primary-color), var(--secondary-color));
+             color: white;
+             border: none;
+             padding: 14px 28px;
+             font-family: inherit;
+             font-size: 16px;
+             cursor: pointer;
+             transition: all 0.3s;
+             text-transform: uppercase;
+             letter-spacing: 1px;
+             border-radius: 50px;
+             display: flex;
+             align-items: center;
+             justify-content: center;
+             gap: 10px;
+             box-shadow: 0 4px 10px rgba(111, 66, 193, 0.3);
+         }
+         button:hover {
+             transform: translateY(-2px);
+             box-shadow: 0 6px 15px rgba(111, 66, 193, 0.5);
+             background: linear-gradient(135deg, var(--hover-color), var(--primary-color));
+         }
+         button:active {
+             transform: translateY(1px);
+         }
+         #send-button {
+             background: linear-gradient(135deg, #2ecc71, #27ae60);
+             padding: 10px 20px;
+             font-size: 14px;
+             flex-shrink: 0;
+         }
+         #send-button:hover {
+             background: linear-gradient(135deg, #27ae60, #229954);
+         }
+         #audio-output {
+             display: none;
+         }
+         .icon-with-spinner {
+             display: flex;
+             align-items: center;
+             justify-content: center;
+             gap: 12px;
+             min-width: 180px;
+         }
+         .spinner {
+             width: 20px;
+             height: 20px;
+             border: 2px solid #ffffff;
+             border-top-color: transparent;
+             border-radius: 50%;
+             animation: spin 1s linear infinite;
+             flex-shrink: 0;
+         }
+         @keyframes spin {
+             to {
+                 transform: rotate(360deg);
+             }
+         }
+         .audio-visualizer {
+             display: flex;
+             align-items: center;
+             justify-content: center;
+             gap: 5px;
+             min-width: 80px;
+             height: 25px;
+         }
+         .visualizer-bar {
+             width: 4px;
+             height: 100%;
+             background-color: rgba(255, 255, 255, 0.7);
+             border-radius: 2px;
+             transform-origin: bottom;
+             transform: scaleY(0.1);
+             transition: transform 0.1s ease;
+         }
+         .toast {
+             position: fixed;
+             top: 20px;
+             left: 50%;
+             transform: translateX(-50%);
+             padding: 16px 24px;
+             border-radius: 8px;
+             font-size: 14px;
+             z-index: 1000;
+             display: none;
+             box-shadow: 0 4px 12px rgba(0, 0, 0, 0.3);
+         }
+         .toast.error {
+             background-color: #f44336;
+             color: white;
+         }
+         .toast.warning {
+             background-color: #ff9800;
+             color: white;
+         }
+         .status-indicator {
+             display: inline-flex;
+             align-items: center;
+             margin-top: 10px;
+             font-size: 14px;
+             color: #aaa;
+         }
+         .status-dot {
+             width: 8px;
+             height: 8px;
+             border-radius: 50%;
+             margin-right: 8px;
+         }
+         .status-dot.connected {
+             background-color: #4caf50;
+         }
+         .status-dot.disconnected {
+             background-color: #f44336;
+         }
+         .status-dot.connecting {
+             background-color: #ff9800;
+             animation: pulse 1.5s infinite;
+         }
+         @keyframes pulse {
+             0% {
+                 opacity: 0.6;
+             }
+             50% {
+                 opacity: 1;
+             }
+             100% {
+                 opacity: 0.6;
+             }
+         }
+         .mouse-logo {
+             position: relative;
+             width: 40px;
+             height: 40px;
+         }
+         .mouse-ears {
+             position: absolute;
+             width: 15px;
+             height: 15px;
+             background-color: var(--primary-color);
+             border-radius: 50%;
+         }
+         .mouse-ear-left {
+             top: 0;
+             left: 5px;
+         }
+         .mouse-ear-right {
+             top: 0;
+             right: 5px;
+         }
+         .mouse-face {
+             position: absolute;
+             top: 10px;
+             left: 5px;
+             width: 30px;
+             height: 30px;
+             background-color: var(--secondary-color);
+             border-radius: 50%;
+         }
+         .language-info {
+             font-size: 12px;
+             color: #888;
+             margin-left: 5px;
+         }
+     </style>
+ </head>
+
+ <body>
+     <div id="error-toast" class="toast"></div>
+     <div class="container">
+         <div class="header">
+             <div class="logo">
+                 <div class="mouse-logo">
+                     <div class="mouse-ears mouse-ear-left"></div>
+                     <div class="mouse-ears mouse-ear-right"></div>
+                     <div class="mouse-face"></div>
+                 </div>
+                 <h1>MOUSE 음성 챗</h1>
+             </div>
+             <div class="status-indicator">
+                 <div id="status-dot" class="status-dot disconnected"></div>
+                 <span id="status-text">연결 대기 중</span>
+             </div>
+         </div>
+
+         <div class="main-content">
+             <div class="sidebar">
+                 <div class="settings-section">
+                     <h3 style="margin: 0 0 15px 0; color: var(--primary-color);">설정</h3>
+                     <div class="settings-grid">
+                         <div class="setting-item">
+                             <span class="setting-label">웹 검색</span>
+                             <div id="search-toggle" class="toggle-switch">
+                                 <div class="toggle-slider"></div>
+                             </div>
+                         </div>
+                         <div class="setting-item">
+                             <span class="setting-label">자동 번역</span>
+                             <select id="language-select">
+                                 <option value="">비활성화</option>
+                                 <option value="ko">한국어 (Korean)</option>
+                                 <option value="en">English</option>
+                                 <option value="es">Español (Spanish)</option>
+                                 <option value="fr">Français (French)</option>
+                                 <option value="de">Deutsch (German)</option>
+                                 <option value="it">Italiano (Italian)</option>
+                                 <option value="pt">Português (Portuguese)</option>
+                                 <option value="ru">Русский (Russian)</option>
+                                 <option value="ja">日本語 (Japanese)</option>
+                                 <option value="zh">中文 (Chinese)</option>
+                                 <option value="ar">العربية (Arabic)</option>
+                                 <option value="hi">हिन्दी (Hindi)</option>
+                                 <option value="nl">Nederlands (Dutch)</option>
+                                 <option value="pl">Polski (Polish)</option>
+                                 <option value="tr">Türkçe (Turkish)</option>
+                                 <option value="vi">Tiếng Việt (Vietnamese)</option>
+                                 <option value="th">ไทย (Thai)</option>
+                                 <option value="id">Bahasa Indonesia</option>
+                                 <option value="sv">Svenska (Swedish)</option>
+                                 <option value="da">Dansk (Danish)</option>
+                                 <option value="no">Norsk (Norwegian)</option>
+                                 <option value="fi">Suomi (Finnish)</option>
+                                 <option value="he">עברית (Hebrew)</option>
+                                 <option value="uk">Українська (Ukrainian)</option>
+                                 <option value="cs">Čeština (Czech)</option>
+                                 <option value="el">Ελληνικά (Greek)</option>
+                                 <option value="ro">Română (Romanian)</option>
+                                 <option value="hu">Magyar (Hungarian)</option>
+                                 <option value="ms">Bahasa Melayu (Malay)</option>
+                             </select>
+                         </div>
+                     </div>
+                     <div class="interpretation-section">
+                         <div class="setting-item">
+                             <span class="setting-label">자동 통역</span>
+                             <div id="interpretation-toggle" class="toggle-switch">
+                                 <div class="toggle-slider"></div>
+                             </div>
+                         </div>
+                         <div class="setting-item" id="interpretation-language-container" style="display: none;">
+                             <span class="setting-label">통역 언어</span>
+                             <select id="interpretation-language-select">
+                                 <option value="">언어 선택</option>
+                                 <option value="ko">한국어 (Korean)</option>
+                                 <option value="en">English</option>
+                                 <option value="es">Español (Spanish)</option>
+                                 <option value="fr">Français (French)</option>
+                                 <option value="de">Deutsch (German)</option>
+                                 <option value="it">Italiano (Italian)</option>
+                                 <option value="pt">Português (Portuguese)</option>
+                                 <option value="ru">Русский (Russian)</option>
+                                 <option value="ja">日本語 (Japanese)</option>
+                                 <option value="zh">中文 (Chinese)</option>
+                                 <option value="ar">العربية (Arabic)</option>
+                                 <option value="hi">हिन्दी (Hindi)</option>
+                                 <option value="nl">Nederlands (Dutch)</option>
+                                 <option value="pl">Polski (Polish)</option>
+                                 <option value="tr">Türkçe (Turkish)</option>
+                                 <option value="vi">Tiếng Việt (Vietnamese)</option>
+                                 <option value="th">ไทย (Thai)</option>
+                                 <option value="id">Bahasa Indonesia</option>
+                                 <option value="sv">Svenska (Swedish)</option>
+                                 <option value="da">Dansk (Danish)</option>
+                                 <option value="no">Norsk (Norwegian)</option>
+                                 <option value="fi">Suomi (Finnish)</option>
+                                 <option value="he">עברית (Hebrew)</option>
+                                 <option value="uk">Українська (Ukrainian)</option>
+                                 <option value="cs">Čeština (Czech)</option>
+                                 <option value="el">Ελληνικά (Greek)</option>
+                                 <option value="ro">Română (Romanian)</option>
+                                 <option value="hu">Magyar (Hungarian)</option>
+                                 <option value="ms">Bahasa Melayu (Malay)</option>
+                             </select>
+                         </div>
+                     </div>
+                     <div class="interpretation-info" id="interpretation-info" style="display: none;">
+                         <strong>통역 모드 안내:</strong><br>
+                         • 음성으로 말하면 선택한 언어로 자동 통역됩니다<br>
+                         • Whisper + GPT-4o-mini + TTS를 사용합니다<br>
+                         • 말을 마치고 잠시 기다리면 통역이 시작됩니다
+                     </div>
+                     <div class="text-input-section">
+                         <label for="system-prompt" class="setting-label">시스템 프롬프트:</label>
+                         <textarea id="system-prompt" placeholder="AI 어시스턴트의 성격, 역할, 행동 방식을 정의하세요...">You are a helpful assistant. Respond in a friendly and professional manner.</textarea>
+                     </div>
+                 </div>
+
+                 <div class="controls">
+                     <button id="start-button">대화 시작</button>
+                 </div>
+             </div>
+
+             <div class="chat-section">
+                 <div class="chat-container">
+                     <h3 style="margin: 0 0 15px 0; color: var(--primary-color);">대화</h3>
+                     <div class="chat-messages" id="chat-messages"></div>
+                     <div class="text-input-section" style="margin-top: 10px;">
+                         <div style="display: flex; gap: 10px;">
+                             <input type="text" id="text-input" placeholder="텍스트 메시지를 입력하세요..." style="flex-grow: 1;" />
+                             <button id="send-button" style="display: none;">전송</button>
+                         </div>
+                     </div>
+                 </div>
+             </div>
+         </div>
+     </div>
+     <audio id="audio-output"></audio>
+
+     <script>
+         let peerConnection;
+         let webrtc_id;
+         let webSearchEnabled = false;
+         let selectedLanguage = "";
+         let interpretationMode = false;
+         let interpretationLanguage = "";
+         let systemPrompt = "You are a helpful assistant. Respond in a friendly and professional manner.";
+         const audioOutput = document.getElementById('audio-output');
+         const startButton = document.getElementById('start-button');
+         const sendButton = document.getElementById('send-button');
+         const chatMessages = document.getElementById('chat-messages');
+         const statusDot = document.getElementById('status-dot');
+         const statusText = document.getElementById('status-text');
+         const searchToggle = document.getElementById('search-toggle');
+         const languageSelect = document.getElementById('language-select');
+         const interpretationToggle = document.getElementById('interpretation-toggle');
+         const interpretationLanguageSelect = document.getElementById('interpretation-language-select');
+         const interpretationLanguageContainer = document.getElementById('interpretation-language-container');
+         const interpretationInfo = document.getElementById('interpretation-info');
+         const systemPromptInput = document.getElementById('system-prompt');
+         const textInput = document.getElementById('text-input');
+         let audioLevel = 0;
+         let animationFrame;
+         let audioContext, analyser, audioSource;
+         let dataChannel = null;
+         let isVoiceActive = false;
+
+         // Web search toggle functionality
+         searchToggle.addEventListener('click', () => {
+             webSearchEnabled = !webSearchEnabled;
+             searchToggle.classList.toggle('active', webSearchEnabled);
+             console.log('Web search enabled:', webSearchEnabled);
+         });
+
+         // Language selection
+         languageSelect.addEventListener('change', () => {
+             selectedLanguage = languageSelect.value;
+             console.log('Selected language:', selectedLanguage);
+         });
+
+         // Interpretation mode toggle
+         interpretationToggle.addEventListener('click', () => {
+             if (!interpretationMode) {
+                 // Turning ON interpretation mode
+                 interpretationLanguageContainer.style.display = 'flex';
+                 interpretationInfo.style.display = 'block';
+
+                 // Show language selector first
+                 showError('통역 언어를 선택해주세요.');
+                 interpretationToggle.classList.remove('active');
+
+                 // Don't actually enable interpretation mode until language is selected
+                 return;
+             } else {
+                 // Turning OFF interpretation mode
+                 interpretationMode = false;
+                 interpretationToggle.classList.remove('active');
+                 interpretationLanguageContainer.style.display = 'none';
+                 interpretationInfo.style.display = 'none';
+                 interpretationLanguage = '';
+                 interpretationLanguageSelect.value = '';
+
+                 // Re-enable other features
+                 languageSelect.disabled = false;
+                 searchToggle.style.opacity = '1';
+                 searchToggle.style.pointerEvents = 'auto';
+                 textInput.disabled = false;
+                 textInput.placeholder = '텍스트 메시지를 입력하세요...';
+                 sendButton.style.display = 'block';
+
+                 console.log('Interpretation mode disabled');
+
+                 // If connected, restart to apply normal mode
+                 if (peerConnection && peerConnection.connectionState === 'connected') {
+                     showError('일반 모드로 전환하기 위해 연결을 다시 시작합니다.');
+                     stop();
+                     setTimeout(() => {
+                         setupWebRTC();
+                     }, 500);
+                 }
+             }
+
+             console.log('Interpretation mode:', interpretationMode);
+         });
+
+         // Interpretation language selection
+         interpretationLanguageSelect.addEventListener('change', () => {
+             interpretationLanguage = interpretationLanguageSelect.value;
+             console.log('Interpretation language:', interpretationLanguage);
+
+             if (interpretationLanguage && !interpretationMode) {
+                 // Now actually enable interpretation mode
+                 interpretationMode = true;
+                 interpretationToggle.classList.add('active');
+
+                 // Disable other features
+                 languageSelect.value = '';
+                 selectedLanguage = '';
+                 languageSelect.disabled = true;
+                 searchToggle.classList.remove('active');
+                 webSearchEnabled = false;
+                 searchToggle.style.opacity = '0.5';
+                 searchToggle.style.pointerEvents = 'none';
+                 textInput.disabled = true;
+                 textInput.placeholder = '통역 모드에서는 텍스트 입력이 지원되지 않습니다';
+                 sendButton.style.display = 'none';
+
+                 console.log('Interpretation mode enabled with language:', interpretationLanguage);
+
+                 // If already connected, restart the connection with new settings
+                 if (peerConnection && peerConnection.connectionState === 'connected') {
+                     showError('통역 모드 설정을 적용하기 위해 연결을 다시 시작합니다.');
+                     stop();
+                     setTimeout(() => {
+                         setupWebRTC();
+                     }, 500);
+                 }
+             }
+         });
+
+         // System prompt update
+         systemPromptInput.addEventListener('input', () => {
+             systemPrompt = systemPromptInput.value || "You are a helpful assistant. Respond in a friendly and professional manner.";
+         });
+
+         // Text input handling
+         textInput.addEventListener('keypress', (e) => {
+             if (e.key === 'Enter' && !e.shiftKey) {
+                 e.preventDefault();
+                 sendTextMessage();
+             }
+         });
+
+         sendButton.addEventListener('click', sendTextMessage);
+
+         async function sendTextMessage() {
+             const message = textInput.value.trim();
+             if (!message) return;
+
+             // Don't allow text messages in interpretation mode
+             if (interpretationMode) {
+                 showError('통역 모드에서는 텍스트 입력이 지원되지 않습니다.');
+                 return;
+             }
+
+             // Add user message to chat
+             addMessage('user', message);
+             textInput.value = '';
+
+             // Show sending indicator
+             const typingIndicator = document.createElement('div');
+             typingIndicator.classList.add('message', 'assistant');
+             typingIndicator.textContent = '입력 중...';
+             typingIndicator.id = 'typing-indicator';
+             chatMessages.appendChild(typingIndicator);
+             chatMessages.scrollTop = chatMessages.scrollHeight;
+
+             try {
+                 // Send to text chat endpoint
+                 const response = await fetch('/chat/text', {
+                     method: 'POST',
+                     headers: { 'Content-Type': 'application/json' },
+                     body: JSON.stringify({
+                         message: message,
+                         web_search_enabled: webSearchEnabled,
+                         target_language: selectedLanguage,
+                         system_prompt: systemPrompt
+                     })
+                 });
+
+                 const data = await response.json();
+
+                 // Remove typing indicator
+                 const indicator = document.getElementById('typing-indicator');
+                 if (indicator) indicator.remove();
+
+                 if (data.error) {
+                     showError(data.error);
+                 } else {
+                     // Add assistant response
+                     let content = data.response;
+                     if (selectedLanguage && data.language) {
+                         content += ` <span class="language-info">[${data.language}]</span>`;
+                     }
+                     addMessage('assistant', content);
+                 }
+             } catch (error) {
+                 console.error('Error sending text message:', error);
+                 const indicator = document.getElementById('typing-indicator');
+                 if (indicator) indicator.remove();
+                 showError('메시지 전송 중 오류가 발생했습니다.');
+             }
+         }
+
+         function updateStatus(state) {
+             statusDot.className = 'status-dot ' + state;
+             if (state === 'connected') {
+                 statusText.textContent = '연결됨';
+                 if (!interpretationMode) {
+                     sendButton.style.display = 'block';
+                 }
+                 isVoiceActive = true;
+             } else if (state === 'connecting') {
+                 statusText.textContent = '연결 중...';
+                 sendButton.style.display = 'none';
+             } else {
+                 statusText.textContent = '연결 대기 중';
+                 if (!interpretationMode) {
+                     sendButton.style.display = 'block'; // Show send button even when disconnected for text chat
+                 }
+                 isVoiceActive = false;
+             }
+         }
+
+         function updateButtonState() {
+             const button = document.getElementById('start-button');
+             if (peerConnection && (peerConnection.connectionState === 'connecting' || peerConnection.connectionState === 'new')) {
+                 button.innerHTML = `
+                     <div class="icon-with-spinner">
+                         <div class="spinner"></div>
+                         <span>연결 중...</span>
+                     </div>
+                 `;
+                 updateStatus('connecting');
+             } else if (peerConnection && peerConnection.connectionState === 'connected') {
+                 button.innerHTML = `
+                     <div class="icon-with-spinner">
+                         <div class="audio-visualizer" id="audio-visualizer">
+                             <div class="visualizer-bar"></div>
+                             <div class="visualizer-bar"></div>
+                             <div class="visualizer-bar"></div>
+                             <div class="visualizer-bar"></div>
+                             <div class="visualizer-bar"></div>
+                         </div>
+                         <span>대화 종료</span>
+                     </div>
+                 `;
+                 updateStatus('connected');
+             } else {
+                 button.innerHTML = '대화 시작';
+                 updateStatus('disconnected');
+             }
+         }
+
+         function setupAudioVisualization(stream) {
+             audioContext = new (window.AudioContext || window.webkitAudioContext)();
+             analyser = audioContext.createAnalyser();
+             audioSource = audioContext.createMediaStreamSource(stream);
+             audioSource.connect(analyser);
+             analyser.fftSize = 256;
+             const bufferLength = analyser.frequencyBinCount;
+             const dataArray = new Uint8Array(bufferLength);
+
+             const visualizerBars = document.querySelectorAll('.visualizer-bar');
+             const barCount = visualizerBars.length;
+
+             function updateAudioLevel() {
+                 analyser.getByteFrequencyData(dataArray);
+
+                 for (let i = 0; i < barCount; i++) {
+                     const start = Math.floor(i * (bufferLength / barCount));
+                     const end = Math.floor((i + 1) * (bufferLength / barCount));
+
+                     let sum = 0;
+                     for (let j = start; j < end; j++) {
+                         sum += dataArray[j];
+                     }
+
+                     const average = sum / (end - start) / 255;
+                     const scaleY = 0.1 + average * 0.9;
+                     visualizerBars[i].style.transform = `scaleY(${scaleY})`;
+                 }
+
+                 animationFrame = requestAnimationFrame(updateAudioLevel);
+             }
+
+             updateAudioLevel();
+         }
+
+         function showError(message) {
+             const toast = document.getElementById('error-toast');
+             toast.textContent = message;
+             toast.className = 'toast error';
+             toast.style.display = 'block';
+             setTimeout(() => {
+                 toast.style.display = 'none';
+             }, 5000);
+         }
+
+         async function setupWebRTC() {
+             const config = __RTC_CONFIGURATION__;
+             peerConnection = new RTCPeerConnection(config);
+             const timeoutId = setTimeout(() => {
+                 const toast = document.getElementById('error-toast');
+                 toast.textContent = "연결이 평소보다 오래 걸리고 있습니다. VPN을 사용 중이신가요?";
+                 toast.className = 'toast warning';
+                 toast.style.display = 'block';
+                 setTimeout(() => {
+                     toast.style.display = 'none';
+                 }, 5000);
+             }, 5000);
+             try {
+                 const stream = await navigator.mediaDevices.getUserMedia({
+                     audio: true
+                 });
+                 setupAudioVisualization(stream);
+                 stream.getTracks().forEach(track => {
+                     peerConnection.addTrack(track, stream);
+                 });
+                 peerConnection.addEventListener('track', (evt) => {
+                     if (audioOutput.srcObject !== evt.streams[0]) {
+                         audioOutput.srcObject = evt.streams[0];
+                         audioOutput.play();
+                     }
+                 });
+
+                 // Create data channel for text messages
+                 dataChannel = peerConnection.createDataChannel('text');
+                 dataChannel.onopen = () => {
+                     console.log('Data channel opened');
+                 };
+                 dataChannel.onmessage = (event) => {
+                     const eventJson = JSON.parse(event.data);
+                     if (eventJson.type === "error") {
+                         showError(eventJson.message);
+                     }
+                 };
+
+                 const offer = await peerConnection.createOffer();
+                 await peerConnection.setLocalDescription(offer);
+                 await new Promise((resolve) => {
+                     if (peerConnection.iceGatheringState === "complete") {
+                         resolve();
+                     } else {
+                         const checkState = () => {
+                             if (peerConnection.iceGatheringState === "complete") {
+                                 peerConnection.removeEventListener("icegatheringstatechange", checkState);
+                                 resolve();
+                             }
+                         };
+                         peerConnection.addEventListener("icegatheringstatechange", checkState);
+                     }
+                 });
+                 peerConnection.addEventListener('connectionstatechange', () => {
+                     console.log('connectionstatechange', peerConnection.connectionState);
+                     if (peerConnection.connectionState === 'connected') {
+                         clearTimeout(timeoutId);
+                         const toast = document.getElementById('error-toast');
+                         toast.style.display = 'none';
+                     }
+                     updateButtonState();
+                 });
+                 webrtc_id = Math.random().toString(36).substring(7);
+
+                 // Log current settings before sending
+                 console.log('Sending offer with settings:', {
+                     webrtc_id: webrtc_id,
+                     web_search_enabled: webSearchEnabled,
+                     target_language: selectedLanguage,
+                     system_prompt: systemPrompt,
+                     interpretation_mode: interpretationMode,
+                     interpretation_language: interpretationLanguage
+                 });
+
+                 const response = await fetch('/webrtc/offer', {
+                     method: 'POST',
+                     headers: { 'Content-Type': 'application/json' },
+                     body: JSON.stringify({
+                         sdp: peerConnection.localDescription.sdp,
+                         type: peerConnection.localDescription.type,
+                         webrtc_id: webrtc_id,
+                         web_search_enabled: webSearchEnabled,
+                         target_language: selectedLanguage,
+                         system_prompt: systemPrompt,
+                         interpretation_mode: interpretationMode,
+                         interpretation_language: interpretationLanguage
+                     })
+                 });
+                 const serverResponse = await response.json();
+                 if (serverResponse.status === 'failed') {
+                     showError(serverResponse.meta.error === 'concurrency_limit_reached'
+                         ? `너무 많은 연결입니다. 최대 한도는 ${serverResponse.meta.limit} 입니다.`
+                         : serverResponse.meta.error);
+                     stop();
+                     return;
+                 }
+                 await peerConnection.setRemoteDescription(serverResponse);
+                 const eventSource = new EventSource('/outputs?webrtc_id=' + webrtc_id);
+                 eventSource.addEventListener("output", (event) => {
+                     const eventJson = JSON.parse(event.data);
+                     let content = eventJson.content;
+
+                     // Debug logging for interpretation mode
+                     if (interpretationMode) {
+                         console.log('[INTERPRETATION OUTPUT]', {
+                             content: content,
+                             language: eventJson.language,
+                             mode: eventJson.mode,
+                             expectedLanguage: interpretationLanguage
+                         });
+                     }
+
+                     if (selectedLanguage && eventJson.language) {
+                         content += ` <span class="language-info">[${eventJson.language}]</span>`;
+                     } else if (interpretationMode && eventJson.language) {
+                         // In interpretation mode, show the translation process
+                         if (content.includes('→')) {
+                             // Format: "Korean text → English text"
+                             const parts = content.split('→');
+                             if (parts.length === 2) {
+                                 content = `<span style="color: #999;">${parts[0].trim()}</span>` +
+                                     `<span class="interpretation-arrow">→</span>` +
+                                     `<strong>${parts[1].trim()}</strong>`;
+                             }
+                         }
+                         content += ` <span class="language-info">[통역: ${eventJson.language}]</span>`;
+                     }
+                     addMessage("assistant", content);
+                 });
+                 eventSource.addEventListener("search", (event) => {
+                     const eventJson = JSON.parse(event.data);
+                     if (eventJson.query) {
+                         addMessage("search-result", `웹 검색 중: "${eventJson.query}"`);
+                     }
+                 });
+             } catch (err) {
+                 clearTimeout(timeoutId);
+                 console.error('Error setting up WebRTC:', err);
+                 showError('연결을 설정하지 못했습니다. 다시 시도해 주세요.');
+                 stop();
+             }
+         }
+
+         function addMessage(role, content) {
+             const messageDiv = document.createElement('div');
+             messageDiv.classList.add('message', role);
+
+             // Check if it's an interpretation message
+             if (interpretationMode && role === 'assistant' && content.includes('→')) {
+                 messageDiv.classList.add('interpretation');
+             }
+
+             if (content.includes('<span')) {
+                 messageDiv.innerHTML = content;
+             } else {
+                 messageDiv.textContent = content;
+             }
+             chatMessages.appendChild(messageDiv);
+             chatMessages.scrollTop = chatMessages.scrollHeight;
+         }
+
+         function stop() {
+             if (animationFrame) {
+                 cancelAnimationFrame(animationFrame);
+             }
+             if (audioContext) {
+                 audioContext.close();
+                 audioContext = null;
+                 analyser = null;
+                 audioSource = null;
+             }
+             if (peerConnection) {
+                 if (peerConnection.getTransceivers) {
+                     peerConnection.getTransceivers().forEach(transceiver => {
+                         if (transceiver.stop) {
+                             transceiver.stop();
+                         }
+                     });
+                 }
+                 if (peerConnection.getSenders) {
+                     peerConnection.getSenders().forEach(sender => {
+                         if (sender.track && sender.track.stop) sender.track.stop();
+                     });
+                 }
+                 console.log('closing');
+                 peerConnection.close();
+             }
+             dataChannel = null;
+             updateButtonState();
+             audioLevel = 0;
+         }
+
+         startButton.addEventListener('click', () => {
+             console.log('clicked');
+             console.log(peerConnection, peerConnection?.connectionState);
+             if (!peerConnection || peerConnection.connectionState !== 'connected') {
+                 setupWebRTC();
+             } else {
+                 console.log('stopping');
+                 stop();
+             }
+         });
+
+         // Initialize send button visibility on page load
+         window.addEventListener('DOMContentLoaded', () => {
+             sendButton.style.display = 'block';
+         });
+     </script>
+ </body>
+
+ </html>"""
+
+
+ class BraveSearchClient:
+     """Brave Search API client"""
+     def __init__(self, api_key: str):
+         self.api_key = api_key
+         self.base_url = "https://api.search.brave.com/res/v1/web/search"
+
+     async def search(self, query: str, count: int = 10) -> List[Dict]:
+         """Perform a web search using Brave Search API"""
+         if not self.api_key:
+             return []
+
+         headers = {
+             "Accept": "application/json",
+             "X-Subscription-Token": self.api_key
+         }
+         params = {
+             "q": query,
+             "count": count,
+             "lang": "ko"
+         }
+
+         async with httpx.AsyncClient() as client:
+             try:
+                 response = await client.get(self.base_url, headers=headers, params=params)
+                 response.raise_for_status()
+                 data = response.json()
+
+                 results = []
+                 if "web" in data and "results" in data["web"]:
+                     for result in data["web"]["results"][:count]:
+                         results.append({
+                             "title": result.get("title", ""),
+                             "url": result.get("url", ""),
+                             "description": result.get("description", "")
+                         })
+                 return results
+             except Exception as e:
+                 print(f"Brave Search error: {e}")
+                 return []
+
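+ # Usage sketch (hypothetical query): from async code,
+ #     results = await search_client.search("오늘 서울 날씨", count=5)
+ # yields [{"title": ..., "url": ..., "description": ...}, ...], or [] when the
+ # API key is absent or the request fails.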
+
+ # Initialize search client globally
+ brave_api_key = os.getenv("BSEARCH_API")
+ search_client = BraveSearchClient(brave_api_key) if brave_api_key else None
+ print(f"Search client initialized: {search_client is not None}, API key present: {bool(brave_api_key)}")
+
+ # Store connection settings
+ connection_settings = {}
+
+ # Initialize OpenAI client for text chat
+ client = openai.AsyncOpenAI()
+
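+ # AsyncOpenAI() reads OPENAI_API_KEY from the environment, which load_dotenv()
+ # populated above.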
+ def get_translation_instructions(target_language: str) -> str:
+     """Get instructions for translation based on target language"""
+     if not target_language:
+         return ""
+
+     language_name = SUPPORTED_LANGUAGES.get(target_language, target_language)
+     return (
+         f"\n\nIMPORTANT: You must respond in {language_name} ({target_language}). "
+         f"Translate all your responses to {language_name}."
+     )
+
+
+ def update_chatbot(chatbot: list[dict], response: ResponseAudioTranscriptDoneEvent):
+     chatbot.append({"role": "assistant", "content": response.transcript})
+     return chatbot
+
+
+ async def process_text_chat(message: str, web_search_enabled: bool, target_language: str,
+                             system_prompt: str) -> Dict[str, str]:
+     """Process text chat using GPT-4o-mini model"""
+     try:
+         # If target language is set, override system prompt completely
+         if target_language:
+             language_name = SUPPORTED_LANGUAGES.get(target_language, target_language)
+
+             # Create system prompt in target language
+             if target_language == "en":
+                 base_instructions = f"You are a helpful assistant. You speak ONLY English. Never use Korean or any other language. {system_prompt}"
+                 user_prefix = "Please respond in English: "
+             elif target_language == "ja":
+                 base_instructions = f"あなたは親切なアシスタントです。日本語のみを話します。韓国語や他の言語は絶対に使用しません。{system_prompt}"
+                 user_prefix = "日本語で答えてください: "
+             elif target_language == "zh":
+                 base_instructions = f"你是一个乐于助人的助手。你只说中文。绝不使用韩语或其他语言。{system_prompt}"
+                 user_prefix = "请用中文回答: "
+             elif target_language == "es":
+                 base_instructions = f"Eres un asistente útil. Solo hablas español. Nunca uses coreano u otros idiomas. {system_prompt}"
+                 user_prefix = "Por favor responde en español: "
+             else:
+                 base_instructions = f"You are a helpful assistant that speaks ONLY {language_name}. {system_prompt}"
+                 user_prefix = f"Please respond in {language_name}: "
+         else:
+             base_instructions = system_prompt or "You are a helpful assistant."
+             user_prefix = ""
+
+         messages = [
+             {"role": "system", "content": base_instructions}
+         ]
+
+         # Handle web search if enabled
+         if web_search_enabled and search_client:
+             # Check if the message requires web search
+             search_keywords = ["날씨", "기온", "비", "눈", "뉴스", "소식", "현재", "최근",
+                                "오늘", "지금", "가격", "환율", "주가", "weather", "news",
+                                "current", "today", "price", "2024", "2025"]
+
+             should_search = any(keyword in message.lower() for keyword in search_keywords)
+
+             if should_search:
+                 # Perform web search
+                 search_results = await search_client.search(message)
+                 if search_results:
+                     search_context = "웹 검색 결과:\n\n"
+                     for i, result in enumerate(search_results[:5], 1):
+                         search_context += f"{i}. {result['title']}\n{result['description']}\n\n"
+
+                     # Add search context in target language if set
+                     if target_language:
+                         search_instruction = f"Use this search information but respond in {SUPPORTED_LANGUAGES.get(target_language, target_language)} only: "
+                     else:
+                         search_instruction = "다음 웹 검색 결과를 참고하여 답변하세요: "
+
+                     messages.append({
+                         "role": "system",
+                         "content": search_instruction + "\n\n" + search_context
+                     })
+
+         # Add user message with language prefix
+         messages.append({"role": "user", "content": user_prefix + message})
+
+         # Call GPT-4o-mini
+         response = await client.chat.completions.create(
+             model="gpt-4o-mini",
+             messages=messages,
+             temperature=0.7,
+             max_tokens=2000
+         )
+
+         response_text = response.choices[0].message.content
+
+         # Final check - remove any Korean if target language is not Korean
+         if target_language and target_language != "ko":
+             import re
+             if re.search(r'[가-힣]', response_text):
+                 print(f"[TEXT CHAT] WARNING: Korean detected in response for {target_language}")
+                 # Try again with stronger prompt
+                 messages[-1] = {"role": "user", "content": f"ONLY {SUPPORTED_LANGUAGES.get(target_language, target_language)}, NO KOREAN: {message}"}
+                 retry_response = await client.chat.completions.create(
+                     model="gpt-4o-mini",
+                     messages=messages,
+                     temperature=0.3,
+                     max_tokens=2000
+                 )
+                 response_text = retry_response.choices[0].message.content
+
+         print(f"[TEXT CHAT] Target language: {target_language}")
+         print(f"[TEXT CHAT] Response preview: {response_text[:100]}...")
+
+         return {
+             "response": response_text,
+             "language": SUPPORTED_LANGUAGES.get(target_language, "") if target_language else ""
+         }
+
+     except Exception as e:
+         print(f"Error in text chat: {e}")
+         return {"error": str(e)}
+
+
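+ # Illustrative call (argument values hypothetical):
+ #     result = await process_text_chat("오늘 환율 알려줘", True, "en", "Be concise.")
+ # returns {"response": ..., "language": "English"} on success or {"error": ...}.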
+ class OpenAIHandler(AsyncStreamHandler):
+     def __init__(self, web_search_enabled: bool = False, target_language: str = "",
+                  system_prompt: str = "", webrtc_id: str = None,
+                  interpretation_mode: bool = False, interpretation_language: str = "") -> None:
+         super().__init__(
+             expected_layout="mono",
+             output_sample_rate=SAMPLE_RATE,
+             output_frame_size=480,
+             input_sample_rate=SAMPLE_RATE,
+         )
+         self.connection = None
+         self.output_queue = asyncio.Queue()
+         self.search_client = search_client
+         self.function_call_in_progress = False
+         self.current_function_args = ""
+         self.current_call_id = None
+         self.webrtc_id = webrtc_id
+         self.web_search_enabled = web_search_enabled
+         self.target_language = target_language
+         self.system_prompt = system_prompt
+         self.interpretation_mode = interpretation_mode
+         self.interpretation_language = interpretation_language
+
+         # For interpretation mode
+         self.audio_buffer = []
+         self.is_recording = False
+         self.silence_frames = 0
+         self.silence_threshold = 20  # Reduced for faster response (20 frames = ~0.4 seconds)
+         self.min_audio_length = 10  # Minimum frames to consider as speech
+
+         print(f"Handler created with web_search_enabled={web_search_enabled}, "
+               f"target_language={target_language}, webrtc_id={webrtc_id}, "
+               f"interpretation_mode={interpretation_mode}, interpretation_language={interpretation_language}")
+
+     def copy(self):
+         # Get the most recent settings
+         if connection_settings:
+             # Get the most recent webrtc_id
+             recent_ids = sorted(connection_settings.keys(),
+                                 key=lambda k: connection_settings[k].get('timestamp', 0),
+                                 reverse=True)
+             if recent_ids:
+                 recent_id = recent_ids[0]
+                 settings = connection_settings[recent_id]
+                 return OpenAIHandler(
+                     web_search_enabled=settings.get('web_search_enabled', False),
+                     target_language=settings.get('target_language', ''),
+                     system_prompt=settings.get('system_prompt', ''),
+                     webrtc_id=recent_id,
+                     interpretation_mode=settings.get('interpretation_mode', False),
+                     interpretation_language=settings.get('interpretation_language', '')
+                 )
+
+         print("Handler.copy() called - creating new handler with default settings")
+         return OpenAIHandler(web_search_enabled=False, interpretation_mode=False)
+
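+     # NOTE: the interpretation methods below use self.client, which is not set in
+     # __init__; the truncated remainder of start_up presumably assigns it, e.g.
+     # self.client = openai.AsyncOpenAI() (an assumption, not visible in this diff).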
+     async def search_web(self, query: str) -> str:
+         """Perform web search and return formatted results"""
+         if not self.search_client or not self.web_search_enabled:
+             return "웹 검색이 비활성화되어 있습니다."
+
+         print(f"Searching web for: {query}")
+         results = await self.search_client.search(query)
+         if not results:
+             return f"'{query}'에 대한 검색 결과를 찾을 수 없습니다."
+
+         # Format search results
+         formatted_results = []
+         for i, result in enumerate(results, 1):
+             formatted_results.append(
+                 f"{i}. {result['title']}\n"
+                 f"   URL: {result['url']}\n"
+                 f"   {result['description']}\n"
+             )
+
+         return f"웹 검색 결과 '{query}':\n\n" + "\n".join(formatted_results)
+
+     async def process_text_message(self, message: str):
+         """Process text message from user"""
+         if self.connection:
+             await self.connection.conversation.item.create(
+                 item={
+                     "type": "message",
+                     "role": "user",
+                     "content": [{"type": "input_text", "text": message}]
+                 }
+             )
+             await self.connection.response.create()
+
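+     # With the Realtime API, conversation.item.create appends the user turn to the
+     # session and response.create then asks the model to generate a reply for it.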
1432
+ async def process_interpretation(self):
1433
+ """Process audio buffer for interpretation"""
1434
+ if not self.audio_buffer or not self.interpretation_language:
1435
+ return
1436
+
1437
+ try:
1438
+ print(f"[INTERPRETATION] Processing audio buffer with {len(self.audio_buffer)} frames")
1439
+
1440
+ # Convert audio buffer to WAV format
1441
+ audio_data = np.concatenate(self.audio_buffer)
1442
+
1443
+ # Create WAV file in memory
1444
+ wav_buffer = io.BytesIO()
1445
+ with wave.open(wav_buffer, 'wb') as wav_file:
1446
+ wav_file.setnchannels(1) # Mono
1447
+ wav_file.setsampwidth(2) # 16-bit
1448
+ wav_file.setframerate(SAMPLE_RATE)
1449
+ wav_file.writeframes(audio_data.tobytes())
1450
+
1451
+ wav_buffer.seek(0)
1452
+ wav_buffer.name = "audio.wav"
1453
+
1454
+ # 1. Transcribe with Whisper
1455
+ print("[INTERPRETATION] Transcribing with Whisper...")
1456
+ transcript = await self.client.audio.transcriptions.create(
1457
+ model="whisper-1",
1458
+ file=wav_buffer,
1459
+ language="ko" # Assuming Korean input
1460
+ )
1461
+
1462
+ user_text = transcript.text.strip()
1463
+ print(f"[INTERPRETATION] Transcribed: {user_text}")
1464
+
1465
+ if not user_text:
1466
+ return
1467
+
1468
+ # 2. Translate with GPT-4o-mini
1469
+ target_lang_name = SUPPORTED_LANGUAGES.get(self.interpretation_language, self.interpretation_language)
1470
+
1471
+ # Create very explicit translation examples
1472
+ translation_examples = {
1473
+ "en": {
1474
+ "안녕하세요": "Hello",
1475
+ "감사합니다": "Thank you",
1476
+ "오늘 날씨가 좋네요": "The weather is nice today"
1477
+ },
1478
+ "ja": {
1479
+ "안녕하세요": "こんにちは",
1480
+ "감사합니다": "ありがとうございます",
1481
+ "오늘 날씨가 좋네요": "今日はいい天気ですね"
1482
+ },
1483
+ "zh": {
1484
+ "안녕하세요": "你好",
1485
+ "감사합니다": "谢谢",
1486
+ "오늘 날씨가 좋네요": "今天天气很好"
1487
+ },
1488
+ "es": {
1489
+ "안녕하세요": "Hola",
1490
+ "감사합니다": "Gracias",
1491
+ "오늘 날씨가 좋네요": "El clima está agradable hoy"
1492
+ }
1493
+ }
1494
+
1495
+ examples = translation_examples.get(self.interpretation_language, translation_examples["en"])
1496
+ examples_text = "\n".join([f'"{k}" → "{v}"' for k, v in examples.items()])
1497
+
1498
+ # Ultra-specific prompt
1499
+ system_prompt = f"""You are a Korean to {target_lang_name} translator.
1500
+
1501
+ STRICT RULES:
1502
+ 1. Output ONLY the {target_lang_name} translation
1503
+ 2. Do NOT output Korean
1504
+ 3. Do NOT add explanations
1505
+ 4. Do NOT answer questions
1506
+ 5. Just translate
1507
+
1508
+ Examples:
1509
+ {examples_text}
1510
+
1511
+ Now translate the Korean text to {target_lang_name}. Output ONLY the translation in {target_lang_name}:"""
1512
+
1513
+ print(f"[INTERPRETATION] Translating to {target_lang_name}...")
1514
+ print(f"[INTERPRETATION] System prompt: {system_prompt}")
1515
+
1516
+ translation_response = await self.client.chat.completions.create(
1517
+ model="gpt-4o-mini",
1518
+ messages=[
1519
+ {
1520
+ "role": "system",
1521
+ "content": system_prompt
1522
+ },
1523
+ {
1524
+ "role": "user",
1525
+ "content": f"Translate this Korean to {target_lang_name}: {user_text}"
1526
+ }
1527
+ ],
1528
+ temperature=0.1, # Very low temperature
1529
+ max_tokens=200
1530
+ )
1531
+
1532
+ translated_text = translation_response.choices[0].message.content.strip()
1533
+
1534
+ # Remove any Korean characters if they accidentally appear
1535
+ import re
1536
+ if re.search(r'[가-힣]', translated_text):
1537
+ print(f"[INTERPRETATION] WARNING: Korean characters detected in translation: {translated_text}")
1538
+ # Try to extract only non-Korean parts
1539
+ translated_text = re.sub(r'[가-힣\s]+', ' ', translated_text).strip()
1540
+
1541
+ print(f"[INTERPRETATION] Translated: {translated_text}")
1542
+
1543
+ # 3. Generate speech with TTS
1544
+ print(f"[INTERPRETATION] Generating speech for text: {translated_text}")
1545
+
1546
+ # Select appropriate voice and ensure it speaks the target language
1547
+ voice_map = {
1548
+ "en": "alloy", # Alloy is native English speaker
1549
+ "es": "nova", # Nova handles Spanish well
1550
+ "fr": "shimmer", # Shimmer handles French well
1551
+ "de": "echo", # Echo handles German well
1552
+ "ja": "nova", # Nova can handle Japanese
1553
+ "zh": "nova", # Nova can handle Chinese
1554
+ "ko": "nova", # Nova can handle Korean
1555
+ }
1556
+ selected_voice = voice_map.get(self.interpretation_language, "nova")
1557
+
1558
+ print(f"[INTERPRETATION] Using voice: {selected_voice} for language: {self.interpretation_language}")
1559
+
1560
+ # For some languages, we might need to add pronunciation hints
1561
+ if self.interpretation_language == "en" and re.search(r'[가-힣]', translated_text):
1562
+ print("[INTERPRETATION] ERROR: Korean characters in English translation!")
1563
+ translated_text = "Translation error occurred"
1564
+
1565
+ try:
+ tts_response = await self.client.audio.speech.create(
+ model="tts-1",
+ voice=selected_voice,
+ input=translated_text,
+ response_format="pcm", # PCM format for direct playback
+ speed=1.0
+ )
+ except Exception as tts_error:
+ print(f"[INTERPRETATION] TTS Error: {tts_error}")
+ # If TTS fails, try with a different voice
+ tts_response = await self.client.audio.speech.create(
+ model="tts-1",
+ voice="alloy", # Fallback to alloy
+ input=translated_text,
+ response_format="pcm",
+ speed=1.0
+ )
+
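+ # Note on response_format="pcm": the speech endpoint then returns raw 16-bit
+ # little-endian mono samples at 24 kHz with no file header, which is why the
+ # bytes below can go straight into np.frombuffer without any decoding step.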
+ # Convert response to bytes
+ audio_bytes = b""
+ async for chunk in tts_response.iter_bytes(1024):
+ audio_bytes += chunk
+
+ # Convert PCM to numpy array (TTS outputs at 24kHz)
+ audio_array = np.frombuffer(audio_bytes, dtype=np.int16)
+
+ # Send audio in chunks
+ if len(audio_array) > 0:
+ # Split audio into chunks and send
+ chunk_size = 480 # 480 samples = 20 ms at 24 kHz (assumes SAMPLE_RATE matches the TTS output rate)
+ for i in range(0, len(audio_array), chunk_size):
+ chunk = audio_array[i:i + chunk_size]
+ if len(chunk) < chunk_size:
+ # Zero-pad the last chunk so every frame has the same length
+ chunk = np.pad(chunk, (0, chunk_size - len(chunk)), 'constant')
+
+ await self.output_queue.put((SAMPLE_RATE, chunk.reshape(1, -1)))
+
+ # Send transcript event
+ output_data = {
+ # Ad-hoc object exposing a .transcript attribute, mirroring the shape of a
+ # Realtime transcript event so downstream handlers can treat both the same
+ "event": type('Event', (), {
+ 'transcript': f"{user_text} → {translated_text}"
+ })(),
+ "language": target_lang_name,
+ "mode": "interpretation"
+ }
+ await self.output_queue.put(AdditionalOutputs(output_data))
+
+ except Exception as e:
+ print(f"[INTERPRETATION] Error: {e}")
+ import traceback
+ traceback.print_exc()
+
+ # Send error message to client
+ error_data = {
+ "event": type('Event', (), {
+ 'transcript': f"통역 오류: {str(e)}"
+ })(),
+ "language": "",
+ "mode": "error"
+ }
+ await self.output_queue.put(AdditionalOutputs(error_data))
+ finally:
+ # Clear the audio buffer
+ self.audio_buffer = []
+ self.is_recording = False
+ self.silence_frames = 0
+
+ def get_translation_instructions(self):
+ """Get instructions for translation based on target language"""
+ if not self.target_language or self.interpretation_mode:
+ return ""
+
+ language_name = SUPPORTED_LANGUAGES.get(self.target_language, self.target_language)
+ return (
+ f"\n\nIMPORTANT: You must respond in {language_name} ({self.target_language}). "
+ f"Translate all your responses to {language_name}. "
+ f"This includes both spoken and written responses."
+ )
+
+ async def start_up(self):
+ """Connect to realtime API or setup interpretation mode"""
+ # First check if we have the most recent settings
+ if connection_settings:
+ recent_ids = sorted(connection_settings.keys(),
+ key=lambda k: connection_settings[k].get('timestamp', 0),
+ reverse=True)
+ if recent_ids:
+ recent_id = recent_ids[0]
+ settings = connection_settings[recent_id]
+ self.web_search_enabled = settings.get('web_search_enabled', False)
+ self.target_language = settings.get('target_language', '')
+ self.system_prompt = settings.get('system_prompt', '')
+ self.interpretation_mode = settings.get('interpretation_mode', False)
+ self.interpretation_language = settings.get('interpretation_language', '')
+ self.webrtc_id = recent_id
+ print(f"start_up: Updated settings from storage - webrtc_id={self.webrtc_id}, "
+ f"web_search_enabled={self.web_search_enabled}, target_language={self.target_language}, "
+ f"interpretation_mode={self.interpretation_mode}")
+ print(f"Handler interpretation settings: mode={self.interpretation_mode}, language={self.interpretation_language}")
+
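+ # The "newest settings win" lookup above, reduced to toy data:
+ #
+ #   settings = {"a": {"timestamp": 1.0}, "b": {"timestamp": 2.0}}
+ #   sorted(settings, key=lambda k: settings[k].get("timestamp", 0), reverse=True)[0]
+ #   # -> "b"; max(settings, key=...) would be an equivalent one-liner
+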
+ print(f"Starting up handler with web_search_enabled={self.web_search_enabled}, "
1668
+ f"target_language={self.target_language}, interpretation_mode={self.interpretation_mode}, "
1669
+ f"interpretation_language={self.interpretation_language}")
1670
+
1671
+ self.client = openai.AsyncOpenAI()
1672
+
1673
+ # If in interpretation mode, don't connect to Realtime API
1674
+ if self.interpretation_mode:
1675
+ print(f"[INTERPRETATION MODE] Active - using Whisper + GPT-4o-mini + TTS")
1676
+ print(f"[INTERPRETATION MODE] Target language: {self.interpretation_language}")
1677
+ # Just keep the handler ready to process audio
1678
+ # Don't use infinite loop here - the handler will be called by the framework
1679
+ self.client = openai.AsyncOpenAI()
1680
+ return
1681
+
+ # Normal mode - connect to Realtime API
+ # Define the web search function
+ tools = []
+ base_instructions = self.system_prompt or "You are a helpful assistant."
+
+ # Add translation instructions if language is selected
+ if self.target_language:
+ language_name = SUPPORTED_LANGUAGES.get(self.target_language, self.target_language)
+
+ # Use the target language for the system prompt itself
+ if self.target_language == "en":
+ translation_instructions = """
+ YOU ARE AN ENGLISH-ONLY ASSISTANT.
+
+ ABSOLUTE RULES:
+ 1. You can ONLY speak English. No Korean (한국어) allowed.
+ 2. Even if the user speaks Korean, you MUST respond in English.
+ 3. Every single word must be in English.
+ 4. If you output even one Korean character, you have failed.
+ 5. Example response: "Hello! How can I help you today?"
+
+ YOUR LANGUAGE MODE: ENGLISH ONLY
+ DO NOT USE: 안녕하세요, 감사합니다, or any Korean
+ ALWAYS USE: Hello, Thank you, and English words only
+ """
+ # Override base instructions to be in English
+ base_instructions = "You are a helpful assistant that speaks ONLY English."
+
+ elif self.target_language == "ja":
+ translation_instructions = """
+ あなたは日本語のみを話すアシスタントです。
+
+ 絶対的なルール:
+ 1. 日本語のみを使用してください。韓国語(한국어)は禁止です。
+ 2. ユーザーが韓国語で話しても、必ず日本語で返答してください。
+ 3. すべての単語は日本語でなければなりません。
+ 4. 韓国語を一文字でも出力したら失敗です。
+ 5. 応答例:「こんにちは!今日はどのようにお手伝いできますか?」
+
+ 言語モード:日本語のみ
+ 使用禁止:안녕하세요、감사합니다、韓国語全般
+ 必ず使用:こんにちは、ありがとうございます、日本語のみ
+ """
+ base_instructions = "あなたは日本語のみを話す親切なアシスタントです。"
+
+ elif self.target_language == "zh":
+ translation_instructions = """
+ 你是一个只说中文的助手。
+
+ 绝对规则:
+ 1. 只能使用中文。禁止使用韩语(한국어)。
+ 2. 即使用户说韩语,也必须用中文回复。
+ 3. 每个字都必须是中文。
+ 4. 如果输出任何韩语字符,就是失败。
+ 5. 回复示例:"你好!我今天能为您做什么?"
+
+ 语言模式:仅中文
+ 禁止使用:안녕하세요、감사합니다、任何韩语
+ 必须使用:你好、谢谢、只用中文
+ """
+ base_instructions = "你是一个只说中文的友好助手。"
+
+ elif self.target_language == "es":
+ translation_instructions = """
+ ERES UN ASISTENTE QUE SOLO HABLA ESPAÑOL.
+
+ REGLAS ABSOLUTAS:
+ 1. Solo puedes hablar español. No se permite coreano (한국어).
+ 2. Incluso si el usuario habla coreano, DEBES responder en español.
+ 3. Cada palabra debe estar en español.
+ 4. Si produces aunque sea un carácter coreano, has fallado.
+ 5. Respuesta ejemplo: "¡Hola! ¿Cómo puedo ayudarte hoy?"
+
+ MODO DE IDIOMA: SOLO ESPAÑOL
+ NO USAR: 안녕하세요, 감사합니다, o cualquier coreano
+ SIEMPRE USAR: Hola, Gracias, y solo palabras en español
+ """
+ base_instructions = "Eres un asistente útil que habla SOLO español."
+ else:
+ translation_instructions = f"""
+ YOU MUST ONLY SPEAK {language_name.upper()}.
+
+ RULES:
+ 1. Output only in {language_name}
+ 2. Never use Korean
+ 3. Always respond in {language_name}
+ """
+ base_instructions = f"You are a helpful assistant that speaks ONLY {language_name}."
+ else:
+ translation_instructions = ""
+
+ if self.web_search_enabled and self.search_client:
+ tools = [{
+ "type": "function",
+ "function": {
+ "name": "web_search",
+ "description": "Search the web for current information. Use this for weather, news, prices, current events, or any time-sensitive topics.",
+ "parameters": {
+ "type": "object",
+ "properties": {
+ "query": {
+ "type": "string",
+ "description": "The search query"
+ }
+ },
+ "required": ["query"]
+ }
+ }
+ }]
+ print("Web search function added to tools")
+
+ search_instructions = (
+ "\n\nYou have web search capabilities. "
+ "IMPORTANT: You MUST use the web_search function for ANY of these topics:\n"
+ "- Weather (날씨, 기온, 비, 눈)\n"
+ "- News (뉴스, 소식)\n"
+ "- Current events (현재, 최근, 오늘, 지금)\n"
+ "- Prices (가격, 환율, 주가)\n"
+ "- Sports scores or results\n"
+ "- Any question about 2024 or 2025\n"
+ "- Any time-sensitive information\n\n"
+ "When in doubt, USE web_search. It's better to search and provide accurate information "
+ "than to guess or use outdated information."
+ )
+
+ # Combine all instructions
+ if translation_instructions:
+ # The translation block already carries the persona, so base_instructions is intentionally dropped here
+ instructions = translation_instructions + search_instructions
+ else:
+ instructions = base_instructions + search_instructions
+ else:
+ # No web search
+ if translation_instructions:
+ instructions = translation_instructions
+ else:
+ instructions = base_instructions
+
+ print(f"[NORMAL MODE] Base instructions: {base_instructions[:100]}...")
+ print(f"[NORMAL MODE] Translation instructions: {translation_instructions[:200] if translation_instructions else 'None'}...")
+ print(f"[NORMAL MODE] Combined instructions length: {len(instructions)}")
+ print(f"[NORMAL MODE] Target language: {self.target_language}")
+
+ async with self.client.beta.realtime.connect(
+ model="gpt-4o-mini-realtime-preview-2024-12-17"
+ ) as conn:
+ # Update session with tools
+ session_update = {
+ "turn_detection": {"type": "server_vad"},
+ "instructions": instructions,
+ "tools": tools,
+ "tool_choice": "auto" if tools else "none",
+ "temperature": 0.7,
+ "max_response_output_tokens": 4096,
+ "modalities": ["text", "audio"],
+ "voice": "alloy" # Default voice
+ }
+
+ # Use appropriate voice for the language
+ if self.target_language:
+ # Force language through multiple mechanisms
+ # 1. Use a voice that's known to work well with the language
+ voice_map = {
+ "en": "nova", # Nova has clearer English
+ "es": "nova", # Nova works for Spanish
+ "fr": "shimmer", # Shimmer for French
+ "de": "echo", # Echo for German
+ "ja": "alloy", # Alloy can do Japanese
+ "zh": "alloy", # Alloy can do Chinese
+ "ko": "nova", # Nova for Korean
+ }
+ session_update["voice"] = voice_map.get(self.target_language, "nova")
+
+ # 2. Re-assert the text+audio modalities (there is no per-language modality)
+ session_update["modalities"] = ["text", "audio"]
+
+ # 3. Set output format
+ session_update["output_audio_format"] = "pcm16"
+
+ # 4. Add a language hint (not an officially documented session field; best-effort)
+ if self.target_language in ["en", "es", "fr", "de", "ja", "zh"]:
+ session_update["language"] = self.target_language # Try setting language directly
+
+ print(f"[TRANSLATION MODE] Session update: {json.dumps(session_update, indent=2)}")
+
+ await conn.session.update(session=session_update)
+ self.connection = conn
+ print(f"Connected with tools: {len(tools)} functions, voice: {session_update.get('voice', 'default')}")
+
+ async for event in self.connection:
+ # Debug logging for function calls
+ if event.type.startswith("response.function_call"):
+ print(f"Function event: {event.type}")
+
+ if event.type == "response.audio_transcript.done":
+ print(f"[RESPONSE] Transcript: {event.transcript[:100]}...")
+ print(f"[RESPONSE] Expected language: {self.target_language}")
+
+ output_data = {
+ "event": event,
+ "language": SUPPORTED_LANGUAGES.get(self.target_language, "") if self.target_language else ""
+ }
+ await self.output_queue.put(AdditionalOutputs(output_data))
+
+ elif event.type == "response.audio.delta":
+ await self.output_queue.put(
+ (
+ self.output_sample_rate,
+ np.frombuffer(
+ base64.b64decode(event.delta), dtype=np.int16
+ ).reshape(1, -1),
+ ),
+ )
+
+ # Handle function calls (only in non-interpretation mode)
+ elif event.type == "response.function_call_arguments.start" and not self.interpretation_mode:
+ print("Function call started")
+ self.function_call_in_progress = True
+ self.current_function_args = ""
+ self.current_call_id = getattr(event, 'call_id', None)
+
+ elif event.type == "response.function_call_arguments.delta" and not self.interpretation_mode:
+ if self.function_call_in_progress:
+ self.current_function_args += event.delta
+
+ elif event.type == "response.function_call_arguments.done" and not self.interpretation_mode:
+ if self.function_call_in_progress:
+ print(f"Function call done, args: {self.current_function_args}")
+ try:
+ args = json.loads(self.current_function_args)
+ query = args.get("query", "")
+
+ # Emit search event to client
+ await self.output_queue.put(AdditionalOutputs({
+ "type": "search",
+ "query": query
+ }))
+
+ # Perform the search
+ search_results = await self.search_web(query)
+ print(f"Search results length: {len(search_results)}")
+
+ # Send function result back to the model
+ if self.connection and self.current_call_id:
+ await self.connection.conversation.item.create(
+ item={
+ "type": "function_call_output",
+ "call_id": self.current_call_id,
+ "output": search_results
+ }
+ )
+ await self.connection.response.create()
+
+ except Exception as e:
+ print(f"Function call error: {e}")
+ finally:
+ self.function_call_in_progress = False
+ self.current_function_args = ""
+ self.current_call_id = None
+
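+ # The streamed-arguments pattern above, in miniature: argument JSON arrives in
+ # fragments and only parses once the "done" event fires (hypothetical deltas):
+ #
+ #   import json
+ #   deltas = ['{"que', 'ry": "seoul', ' weather"}']
+ #   buf = ""
+ #   for d in deltas:
+ #       buf += d
+ #   json.loads(buf)  # -> {'query': 'seoul weather'}
+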
+ async def receive(self, frame: tuple[int, np.ndarray]) -> None:
+ if self.interpretation_mode:
+ # In interpretation mode, buffer audio and process with Whisper
+ _, array = frame
+ array = array.squeeze()
+
+ # Simple energy-based voice activity detection
+ audio_level = np.abs(array).mean()
+
+ if audio_level > 200: # Empirically lowered threshold (int16 mean amplitude) for more sensitive detection
+ if not self.is_recording:
+ print(f"[INTERPRETATION] Started recording, level: {audio_level:.1f}")
+ self.is_recording = True
+ self.silence_frames = 0
+ self.audio_buffer.append(array)
+ elif self.is_recording:
+ self.silence_frames += 1
+ self.audio_buffer.append(array)
+
+ # If we've had enough silence, process the audio
+ if self.silence_frames > self.silence_threshold and len(self.audio_buffer) > self.min_audio_length:
+ print(f"[INTERPRETATION] Silence detected after {len(self.audio_buffer)} frames")
+ # Process in the background to avoid blocking
+ asyncio.create_task(self.process_interpretation())
+ else:
+ # Normal mode - use Realtime API
+ if not self.connection:
+ return
+ try:
+ _, array = frame
+ array = array.squeeze()
+ audio_message = base64.b64encode(array.tobytes()).decode("utf-8")
+ await self.connection.input_audio_buffer.append(audio=audio_message)
+ except Exception as e:
+ print(f"Error in receive: {e}")
+ # Connection might be closed, ignore the error
+
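+ # The VAD rule in isolation: mean absolute amplitude (int16 scale) against a
+ # fixed threshold of 200. On hypothetical frames:
+ #
+ #   import numpy as np
+ #   speech = (np.random.randn(480) * 1000).astype(np.int16)   # noisy "speech"
+ #   silence = np.zeros(480, dtype=np.int16)
+ #   np.abs(speech).mean() > 200    # almost surely True -> keep recording
+ #   np.abs(silence).mean() > 200   # False -> counts toward silence_threshold
+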
+ async def emit(self) -> tuple[int, np.ndarray] | AdditionalOutputs | None:
+ # In interpretation mode, we need to keep checking for audio
+ if self.interpretation_mode:
+ # Use a timeout to prevent blocking forever
+ try:
+ item = await asyncio.wait_for(wait_for_item(self.output_queue), timeout=0.1)
+ return item
+ except asyncio.TimeoutError:
+ return None
+ else:
+ # Normal mode
+ item = await wait_for_item(self.output_queue)
+
+ # Check if it's a dict with text message
+ if isinstance(item, dict) and item.get('type') == 'text_message':
+ await self.process_text_message(item['content'])
+ return None
+
+ return item
+
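+ # The timeout above turns a blocking queue read into a 100 ms poll, so the
+ # framework's emit loop never stalls while no audio is pending. The same
+ # pattern, standalone:
+ #
+ #   import asyncio
+ #   async def poll(q: asyncio.Queue):
+ #       try:
+ #           return await asyncio.wait_for(q.get(), timeout=0.1)
+ #       except asyncio.TimeoutError:
+ #           return None  # nothing ready yet; caller will poll again
+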
+ async def shutdown(self) -> None:
+ if self.interpretation_mode:
+ # Clean up interpretation mode
+ self.audio_buffer = []
+ self.is_recording = False
+ print("[INTERPRETATION MODE] Shutdown complete")
+ else:
+ # Normal mode - close Realtime API connection
+ if self.connection:
+ await self.connection.close()
+ self.connection = None
+
+
+ # Create initial handler instance
+ handler = OpenAIHandler(web_search_enabled=False, interpretation_mode=False)
+
+ # Create components
+ chatbot = gr.Chatbot(type="messages")
+
+ # Create stream with handler instance
+ stream = Stream(
+ handler, # Pass instance, not factory
+ mode="send-receive",
+ modality="audio",
+ additional_inputs=[chatbot],
+ additional_outputs=[chatbot],
+ additional_outputs_handler=update_chatbot,
+ rtc_configuration=get_twilio_turn_credentials() if get_space() else None,
+ concurrency_limit=5 if get_space() else None,
+ time_limit=300 if get_space() else None,
+ )
+
+ app = FastAPI()
+
+ # Mount stream
+ stream.mount(app)
+
+ # Intercept offer to capture settings
+ @app.post("/webrtc/offer", include_in_schema=False)
+ async def custom_offer(request: Request):
+ """Intercept offer to capture settings"""
+ body = await request.json()
+
+ webrtc_id = body.get("webrtc_id")
+ web_search_enabled = body.get("web_search_enabled", False)
+ target_language = body.get("target_language", "")
+ system_prompt = body.get("system_prompt", "")
+ interpretation_mode = body.get("interpretation_mode", False)
+ interpretation_language = body.get("interpretation_language", "")
+
+ print(f"Custom offer - webrtc_id: {webrtc_id}, web_search_enabled: {web_search_enabled}, "
+ f"target_language: {target_language}, interpretation_mode: {interpretation_mode}, "
+ f"interpretation_language: {interpretation_language}")
+
+ # Store settings with timestamp
+ if webrtc_id:
+ connection_settings[webrtc_id] = {
+ 'web_search_enabled': web_search_enabled,
+ 'target_language': target_language,
+ 'system_prompt': system_prompt,
+ 'interpretation_mode': interpretation_mode,
+ 'interpretation_language': interpretation_language,
+ 'timestamp': asyncio.get_event_loop().time()
+ }
+
+ # Remove our custom route temporarily
+ custom_route = None
+ for i, route in enumerate(app.routes):
+ if hasattr(route, 'path') and route.path == "/webrtc/offer" and route.endpoint == custom_offer:
+ custom_route = app.routes.pop(i)
+ break
+
+ # Forward to stream's offer handler
+ response = await stream.offer(body)
+
+ # Re-add our custom route
+ if custom_route:
+ app.routes.insert(0, custom_route)
+
+ return response
+
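+
+ # The interception above, in short: pop this route from app.routes, call
+ # stream.offer with the captured body, then re-insert the route at the front so
+ # later offers are intercepted too. A Starlette middleware could capture the same
+ # settings without touching the route table; this variant keeps the library's
+ # handler untouched at the cost of briefly mutating app.routes per request.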
+
+ @app.post("/chat/text")
+ async def chat_text(request: Request):
+ """Handle text chat messages using GPT-4o-mini"""
+ try:
+ body = await request.json()
+ message = body.get("message", "")
+ web_search_enabled = body.get("web_search_enabled", False)
+ target_language = body.get("target_language", "")
+ system_prompt = body.get("system_prompt", "")
+
+ if not message:
+ return {"error": "메시지가 비어있습니다."}
+
+ # Process text chat
+ result = await process_text_chat(message, web_search_enabled, target_language, system_prompt)
+
+ return result
+
+ except Exception as e:
+ print(f"Error in chat_text endpoint: {e}")
+ return {"error": "채팅 처리 중 오류가 발생했습니다."}
+
+
+ @app.post("/text_message/{webrtc_id}")
+ async def receive_text_message(webrtc_id: str, request: Request):
+ """Receive text message from client"""
+ body = await request.json()
+ message = body.get("content", "")
+
+ # Find the handler for this connection
+ if webrtc_id in stream.handlers:
+ handler = stream.handlers[webrtc_id]
+ # Queue the text message for processing
+ await handler.output_queue.put({
+ 'type': 'text_message',
+ 'content': message
+ })
+
+ return {"status": "ok"}
+
+
+ @app.get("/outputs")
+ async def outputs(webrtc_id: str):
+ """Stream outputs including search events"""
+ async def output_stream():
+ async for output in stream.output_stream(webrtc_id):
+ if hasattr(output, 'args') and output.args:
+ # Check if it's a search event
+ if isinstance(output.args[0], dict) and output.args[0].get('type') == 'search':
+ yield f"event: search\ndata: {json.dumps(output.args[0])}\n\n"
+ # Regular transcript event with language info
+ elif isinstance(output.args[0], dict) and 'event' in output.args[0]:
+ event = output.args[0]['event']
+ if hasattr(event, 'transcript'):
+ data = {
+ "role": "assistant",
+ "content": event.transcript,
+ "language": output.args[0].get('language', ''),
+ "mode": output.args[0].get('mode', 'normal')
+ }
+ yield f"event: output\ndata: {json.dumps(data)}\n\n"
+
+ return StreamingResponse(output_stream(), media_type="text/event-stream")
+
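+ # A minimal consumer for this endpoint (sketch; assumes the app is reachable on
+ # localhost:7860 and that httpx is installed - neither is part of this file):
+ #
+ #   import asyncio, httpx
+ #   async def listen(webrtc_id: str):
+ #       url = f"http://localhost:7860/outputs?webrtc_id={webrtc_id}"
+ #       async with httpx.AsyncClient(timeout=None) as client:
+ #           async with client.stream("GET", url) as resp:
+ #               async for line in resp.aiter_lines():
+ #                   if line.startswith("data: "):
+ #                       print(line[6:])  # JSON for "search" and "output" events
+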
+
+ @app.get("/")
+ async def index():
+ """Serve the HTML page"""
+ rtc_config = get_twilio_turn_credentials() if get_space() else None
+ html_content = HTML_CONTENT.replace("__RTC_CONFIGURATION__", json.dumps(rtc_config))
+ return HTMLResponse(content=html_content)
+
+
+ if __name__ == "__main__":
+ import uvicorn
+
+ mode = os.getenv("MODE")
+ if mode == "UI":
+ stream.ui.launch(server_port=7860)
+ elif mode == "PHONE":
+ stream.fastphone(host="0.0.0.0", port=7860)
+ else:
+ uvicorn.run(app, host="0.0.0.0", port=7860)
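+
+ # Launch modes (usage sketch; MODE is read from the environment above):
+ #
+ #   MODE=UI python app-backup2.py      # built-in Gradio UI on port 7860
+ #   MODE=PHONE python app-backup2.py   # fastphone telephony entry point
+ #   python app-backup2.py              # default: FastAPI app via uvicorn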