openfree committed on
Commit
1efe32e
·
verified ·
1 Parent(s): c33cb88

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +399 -318
app.py CHANGED
@@ -1,327 +1,408 @@
1
- import gradio as gr
2
- import openai
3
- from openai import OpenAI
4
- import numpy as np
5
- import threading
6
- import queue
7
- import time
8
- import json
9
- import websocket
10
- import base64
11
- import pyaudio
12
- import wave
13
- import io
14
- from typing import Generator, Tuple
15
- import asyncio
16
- import edge_tts
17
-
18
- # OpenAI API 키 설정
19
- OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
20
- client = OpenAI(api_key=OPENAI_API_KEY)
21
-
22
- class RealtimeTranslator:
23
- def __init__(self):
24
- self.is_recording = False
25
- self.audio_queue = queue.Queue()
26
- self.text_queue = queue.Queue()
27
- self.translation_queue = queue.Queue()
28
- self.current_text = ""
29
- self.detected_language = None
30
-
31
- def detect_language(self, text: str) -> str:
32
- """ํ…์ŠคํŠธ์˜ ์–ธ์–ด๋ฅผ ๊ฐ์ง€ํ•ฉ๋‹ˆ๋‹ค."""
33
- korean_chars = sum(1 for char in text if ord('๊ฐ€') <= ord(char) <= ord('ํžฃ'))
34
- total_chars = len(text.replace(" ", ""))
35
-
36
- if total_chars > 0:
37
- korean_ratio = korean_chars / total_chars
38
- if korean_ratio > 0.3:
39
- return "ko"
40
- return "en"
41
-
42
- def process_audio_chunk(self, audio_chunk):
43
- """์˜ค๋””์˜ค ์ฒญํฌ๋ฅผ ์ฒ˜๋ฆฌํ•˜์—ฌ ํ…์ŠคํŠธ๋กœ ๋ณ€ํ™˜"""
44
- try:
45
- # ์˜ค๋””์˜ค ์ฒญํฌ๋ฅผ ์ž„์‹œ ํŒŒ์ผ๋กœ ์ €์žฅ
46
- import tempfile
47
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
48
- # WAV ํŒŒ์ผ๋กœ ์ €์žฅ
49
- import wave
50
- with wave.open(tmp_file.name, 'wb') as wav_file:
51
- wav_file.setnchannels(1)
52
- wav_file.setsampwidth(2)
53
- wav_file.setframerate(16000)
54
- wav_file.writeframes(audio_chunk)
55
-
56
- # Whisper API ํ˜ธ์ถœ
57
- with open(tmp_file.name, "rb") as audio_file:
58
- transcript = client.audio.transcriptions.create(
59
- model="whisper-1",
60
- file=audio_file,
61
- language=None,
62
- prompt="์‹ค์‹œ๊ฐ„ ๋Œ€ํ™”๋ฅผ ๋ฒˆ์—ญํ•ฉ๋‹ˆ๋‹ค."
63
- )
64
-
65
- return transcript.text
66
-
67
- except Exception as e:
68
- print(f"์Œ์„ฑ ์ธ์‹ ์˜ค๋ฅ˜: {e}")
69
- return ""
70
-
71
- def translate_stream(self, text: str, source_lang: str) -> str:
72
- """ํ…์ŠคํŠธ๋ฅผ ์‹ค์‹œ๊ฐ„์œผ๋กœ ๋ฒˆ์—ญ"""
73
- try:
74
- if not text or text.strip() == "":
75
- return ""
76
-
77
- # ๋ฒˆ์—ญ ํ”„๋กฌํ”„ํŠธ
78
- if source_lang == "ko":
79
- messages = [
80
- {"role": "system", "content": "์‹ค์‹œ๊ฐ„ ํ†ต์—ญ์‚ฌ์ž…๋‹ˆ๋‹ค. ํ•œ๊ตญ์–ด๋ฅผ ์˜์–ด๋กœ ์ฆ‰์‹œ ๋ฒˆ์—ญํ•ฉ๋‹ˆ๋‹ค."},
81
- {"role": "user", "content": text}
82
- ]
83
- else:
84
- messages = [
85
- {"role": "system", "content": "์‹ค์‹œ๊ฐ„ ํ†ต์—ญ์‚ฌ์ž…๋‹ˆ๋‹ค. ์˜์–ด๋ฅผ ํ•œ๊ตญ์–ด๋กœ ์ฆ‰์‹œ ๋ฒˆ์—ญํ•ฉ๋‹ˆ๋‹ค."},
86
- {"role": "user", "content": text}
87
- ]
88
-
89
- # ์ŠคํŠธ๋ฆฌ๋ฐ ์‘๋‹ต
90
- stream = client.chat.completions.create(
91
- model="gpt-4o-mini",
92
- messages=messages,
93
- stream=True,
94
- temperature=0.3,
95
- max_tokens=150
96
- )
97
-
98
- translated = ""
99
- for chunk in stream:
100
- if chunk.choices[0].delta.content:
101
- translated += chunk.choices[0].delta.content
102
-
103
- return translated
104
-
105
- except Exception as e:
106
- print(f"๋ฒˆ์—ญ ์˜ค๋ฅ˜: {e}")
107
- return ""
108
 
109
- translator = RealtimeTranslator()
 
 
 
 
 
 
 
 
 
110
 
111
- def process_stream(audio_stream):
112
- """์˜ค๋””์˜ค ์ŠคํŠธ๋ฆผ์„ ์‹ค์‹œ๊ฐ„์œผ๋กœ ์ฒ˜๋ฆฌ"""
113
- if audio_stream is None:
114
- yield "๐Ÿ”ด ๋งˆ์ดํฌ๋ฅผ ์ผœ๊ณ  ๋ง์”€ํ•ด์ฃผ์„ธ์š”", "", ""
115
- return
116
-
117
- sample_rate, audio_data = audio_stream
118
-
119
- # ์˜ค๋””์˜ค ๋ฐ์ดํ„ฐ๊ฐ€ ๋„ˆ๋ฌด ์งง์œผ๋ฉด ๋ฌด์‹œ
120
- if len(audio_data) < sample_rate * 0.5: # 0.5์ดˆ ๋ฏธ๋งŒ
121
- yield "๐ŸŽค ๋“ฃ๊ณ  ์žˆ์Šต๋‹ˆ๋‹ค...", "", ""
122
- return
123
-
124
- # ์˜ค๋””์˜ค ์ฒญํฌ ์ฒ˜๋ฆฌ
125
- audio_bytes = audio_data.tobytes()
126
-
127
- # ์Œ์„ฑ์„ ํ…์ŠคํŠธ๋กœ ๋ณ€ํ™˜
128
- text = translator.process_audio_chunk(audio_bytes)
129
-
130
- if text:
131
- # ์–ธ์–ด ๊ฐ์ง€
132
- detected_lang = translator.detect_language(text)
133
-
134
- # ์‹ค์‹œ๊ฐ„ ๋ฒˆ์—ญ
135
- translated = translator.translate_stream(text, detected_lang)
136
-
137
- # ๊ฒฐ๊ณผ ๋ฐ˜ํ™˜
138
- if detected_lang == "ko":
139
- yield f"โœ… ํ•œ๊ตญ์–ด ๊ฐ์ง€", text, translated
140
- else:
141
- yield f"โœ… English detected", translated, text
142
-
143
- def create_realtime_interface():
144
- with gr.Blocks(title="์‹ค์‹œ๊ฐ„ ์Œ์„ฑ ๋ฒˆ์—ญ๊ธฐ", theme=gr.themes.Soft()) as demo:
145
- gr.Markdown(
146
- """
147
- # 🎤 실시간 음성 번역기 (Real-time Voice Translator)
148
-
149
- ### 말하는 동안 실시간으로 번역됩니다!
150
-
151
- 🔴 **시작** 버튼을 누르고 말하면, 실시간으로 번역이 표시됩니다.
152
-
153
- ---
154
- """
155
- )
156
-
157
- with gr.Row():
158
- with gr.Column(scale=2):
159
- audio_input = gr.Audio(
160
- source="microphone",
161
- type="numpy",
162
- streaming=True, # ์ŠคํŠธ๋ฆฌ๋ฐ ๋ชจ๋“œ ํ™œ์„ฑํ™”
163
- label="๐ŸŽค ์‹ค์‹œ๊ฐ„ ๋งˆ์ดํฌ ์ž…๋ ฅ",
164
- elem_id="audio-stream"
165
- )
166
-
167
- with gr.Column(scale=1):
168
- status_text = gr.Textbox(
169
- label="๐Ÿ“Š ์ƒํƒœ",
170
- value="๐Ÿ”ด ๋งˆ์ดํฌ๋ฅผ ์ผœ๊ณ  ๋ง์”€ํ•ด์ฃผ์„ธ์š”",
171
- interactive=False
172
- )
173
-
174
- with gr.Row():
175
- with gr.Column():
176
- korean_output = gr.Textbox(
177
- label="๐Ÿ‡ฐ๐Ÿ‡ท ํ•œ๊ตญ์–ด",
178
- placeholder="ํ•œ๊ตญ์–ด๊ฐ€ ์‹ค์‹œ๊ฐ„์œผ๋กœ ํ‘œ์‹œ๋ฉ๋‹ˆ๋‹ค",
179
- lines=8,
180
- interactive=False,
181
- elem_id="korean-text"
182
- )
183
-
184
- with gr.Column():
185
- english_output = gr.Textbox(
186
- label="๐Ÿ‡บ๐Ÿ‡ธ English",
187
- placeholder="English translation appears here in real-time",
188
- lines=8,
189
- interactive=False,
190
- elem_id="english-text"
191
- )
192
-
193
- # ์ŠคํŠธ๋ฆฌ๋ฐ ์ด๋ฒคํŠธ ์„ค์ •
194
- audio_input.stream(
195
- fn=process_stream,
196
- inputs=[audio_input],
197
- outputs=[status_text, korean_output, english_output],
198
- show_progress=False
199
- )
200
-
201
- gr.Markdown(
202
- """
203
- ---
204
-
205
- ### 💡 사용 팁:
206
- - 명확하게 말할수록 인식률이 높아집니다
207
- - 문장이 끝날 때까지 잠시 멈추면 더 정확한 번역이 됩니다
208
- - 한국어와 영어를 자동으로 감지합니다
209
-
210
- ### ⚙️ 기술 사양:
211
- - **음성 인식**: OpenAI Whisper (실시간 스트리밍)
212
- - **번역**: GPT-4 (스트리밍 모드)
213
- - **지연 시간**: ~1-2초
214
- """
215
- )
216
 
217
- # CSS ์Šคํƒ€์ผ ์ถ”๊ฐ€
218
- demo.css = """
219
- #audio-stream {
220
- height: 150px !important;
221
- }
222
- #korean-text, #english-text {
223
- font-size: 18px !important;
224
- line-height: 1.5 !important;
225
- }
226
- .gradio-container {
227
- max-width: 1200px !important;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
228
  }
229
- """
230
-
231
- return demo
232
-
233
- # 대안: WebSocket 기반 실시간 번역 (더 낮은 지연시간)
234
- class WebSocketTranslator:
235
- def __init__(self):
236
- self.ws_url = "wss://api.openai.com/v1/realtime" # ์˜ˆ์‹œ URL
237
- self.ws = None
238
- self.is_connected = False
239
-
240
- def connect(self):
241
- """WebSocket ์—ฐ๊ฒฐ"""
242
- headers = {
243
- "Authorization": f"Bearer {OPENAI_API_KEY}",
244
- "OpenAI-Beta": "realtime=v1"
 
 
 
 
 
 
 
 
 
 
 
245
  }
246
-
247
- try:
248
- self.ws = websocket.WebSocketApp(
249
- self.ws_url,
250
- header=headers,
251
- on_open=self.on_open,
252
- on_message=self.on_message,
253
- on_error=self.on_error,
254
- on_close=self.on_close
255
- )
256
-
257
- # ๋ณ„๋„ ์Šค๋ ˆ๋“œ์—์„œ ์‹คํ–‰
258
- wst = threading.Thread(target=self.ws.run_forever)
259
- wst.daemon = True
260
- wst.start()
261
-
262
- except Exception as e:
263
- print(f"WebSocket ์—ฐ๊ฒฐ ์˜ค๋ฅ˜: {e}")
264
-
265
- def on_open(self, ws):
266
- self.is_connected = True
267
- print("WebSocket ์—ฐ๊ฒฐ๋จ")
268
-
269
- def on_message(self, ws, message):
270
- """๋ฉ”์‹œ์ง€ ์ˆ˜์‹  ์ฒ˜๋ฆฌ"""
271
- try:
272
- data = json.loads(message)
273
- if data.get("type") == "transcription":
274
- # ์‹ค์‹œ๊ฐ„ ํ…์ŠคํŠธ ์ฒ˜๋ฆฌ
275
- text = data.get("text", "")
276
- self.process_realtime_text(text)
277
- except Exception as e:
278
- print(f"๋ฉ”์‹œ์ง€ ์ฒ˜๋ฆฌ ์˜ค๋ฅ˜: {e}")
 
 
 
 
 
 
 
 
 
 
279
 
280
- def on_error(self, ws, error):
281
- print(f"WebSocket ์˜ค๋ฅ˜: {error}")
282
-
283
- def on_close(self, ws, close_status_code, close_msg):
284
- self.is_connected = False
285
- print("WebSocket ์—ฐ๊ฒฐ ์ข…๋ฃŒ")
 
 
 
 
 
 
286
 
287
- def send_audio(self, audio_data):
288
- """์˜ค๋””์˜ค ๋ฐ์ดํ„ฐ ์ „์†ก"""
289
- if self.is_connected and self.ws:
290
- # ์˜ค๋””์˜ค๋ฅผ base64๋กœ ์ธ์ฝ”๋”ฉ
291
- audio_base64 = base64.b64encode(audio_data).decode('utf-8')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
292
 
293
- message = {
294
- "type": "audio",
295
- "audio": audio_base64
296
- }
 
 
297
 
298
- self.ws.send(json.dumps(message))
299
-
300
- def process_realtime_text(self, text):
301
- """์‹ค์‹œ๊ฐ„ ํ…์ŠคํŠธ ์ฒ˜๋ฆฌ ๋ฐ ๋ฒˆ์—ญ"""
302
- # ์–ธ์–ด ๊ฐ์ง€ ๋ฐ ๋ฒˆ์—ญ ๋กœ์ง
303
- pass
304
-
305
- # 메인 실행
306
- if __name__ == "__main__":
307
- import os
308
-
309
- # API ํ‚ค ํ™•์ธ
310
- if OPENAI_API_KEY == "your-api-key-here":
311
- api_key = os.getenv("OPENAI_API_KEY")
312
- if api_key:
313
- OPENAI_API_KEY = api_key
314
- client = OpenAI(api_key=OPENAI_API_KEY)
315
- else:
316
- print("โš ๏ธ ๊ฒฝ๊ณ : OpenAI API ํ‚ค๋ฅผ ์„ค์ •ํ•ด์ฃผ์„ธ์š”!")
317
- print("ํ™˜๊ฒฝ ๋ณ€์ˆ˜ OPENAI_API_KEY๋ฅผ ์„ค์ •ํ•˜๊ฑฐ๋‚˜ ์ฝ”๋“œ์— ์ง์ ‘ ์ž…๋ ฅํ•˜์„ธ์š”.")
318
-
319
- # Gradio ์•ฑ ์‹คํ–‰
320
- demo = create_realtime_interface()
321
- demo.queue() # ํ ํ™œ์„ฑํ™” (์ŠคํŠธ๋ฆฌ๋ฐ์— ํ•„์š”)
322
- demo.launch(
323
- share=False,
324
- server_name="0.0.0.0",
325
- server_port=7860,
326
- debug=True
327
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import React, { useState, useRef, useEffect } from 'react';
2
+ import { Mic, MicOff, Volume2, Globe, Loader2 } from 'lucide-react';
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
+ const RealtimeVoiceTranslator = () => {
5
+ const [isConnected, setIsConnected] = useState(false);
6
+ const [isRecording, setIsRecording] = useState(false);
7
+ const [transcript, setTranscript] = useState('');
8
+ const [translation, setTranslation] = useState('');
9
+ const [sourceLanguage, setSourceLanguage] = useState('ko');
10
+ const [targetLanguage, setTargetLanguage] = useState('en');
11
+ const [ephemeralKey, setEphemeralKey] = useState('');
12
+ const [logs, setLogs] = useState([]);
13
+ const [isLoading, setIsLoading] = useState(false);
14
 
15
+ const pcRef = useRef(null);
16
+ const dcRef = useRef(null);
17
+ const audioElRef = useRef(null);
18
+
19
+ // ๋กœ๊ทธ ์ถ”๊ฐ€ ํ•จ์ˆ˜
20
+ const addLog = (message, type = 'info') => {
21
+ const timestamp = new Date().toLocaleTimeString();
22
+ setLogs(prev => [...prev, { message, type, timestamp }]);
23
+ };
24
+
25
+ // Ephemeral key ๊ฐ€์ ธ์˜ค๊ธฐ (์‹œ๋ฎฌ๋ ˆ์ด์…˜)
26
+ const getEphemeralKey = async () => {
27
+ // ์‹ค์ œ ๊ตฌํ˜„์—์„œ๋Š” ์„œ๋ฒ„์—์„œ ephemeral key๋ฅผ ๊ฐ€์ ธ์™€์•ผ ํ•ฉ๋‹ˆ๋‹ค
28
+ addLog('Ephemeral key๊ฐ€ ํ•„์š”ํ•ฉ๋‹ˆ๋‹ค. ์„œ๋ฒ„ ์—”๋“œํฌ์ธํŠธ๋ฅผ ๊ตฌํ˜„ํ•ด์ฃผ์„ธ์š”.', 'warning');
29
+ return 'YOUR_EPHEMERAL_KEY';
30
+ };
31
+
32
+ // WebRTC ์—ฐ๊ฒฐ ์ดˆ๊ธฐํ™”
33
+ const initializeConnection = async () => {
34
+ try {
35
+ setIsLoading(true);
36
+ addLog('์—ฐ๊ฒฐ ์ดˆ๊ธฐํ™” ์ค‘...', 'info');
37
+
38
+ // Ephemeral key ๊ฐ€์ ธ์˜ค๊ธฐ
39
+ const key = await getEphemeralKey();
40
+ setEphemeralKey(key);
41
+
42
+ // Peer connection ์ƒ์„ฑ
43
+ const pc = new RTCPeerConnection({
44
+ iceServers: [{ urls: 'stun:stun.l.google.com:19302' }]
45
+ });
46
+ pcRef.current = pc;
47
+
48
+ // ์›๊ฒฉ ์˜ค๋””์˜ค ์ŠคํŠธ๋ฆผ ์„ค์ •
49
+ const audioEl = audioElRef.current;
50
+ pc.ontrack = (e) => {
51
+ audioEl.srcObject = e.streams[0];
52
+ addLog('์˜ค๋””์˜ค ์ŠคํŠธ๋ฆผ ์—ฐ๊ฒฐ๋จ', 'success');
53
+ };
54
+
55
+ // ๋กœ์ปฌ ์˜ค๋””์˜ค ํŠธ๋ž™ ์ถ”๊ฐ€
56
+ const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
57
+ stream.getTracks().forEach(track => {
58
+ pc.addTrack(track, stream);
59
+ });
60
+
61
+ // ๋ฐ์ดํ„ฐ ์ฑ„๋„ ์„ค์ •
62
+ const dc = pc.createDataChannel('oai-events');
63
+ dcRef.current = dc;
64
+
65
+ dc.addEventListener('open', () => {
66
+ addLog('๋ฐ์ดํ„ฐ ์ฑ„๋„ ์—ด๋ฆผ', 'success');
67
+ setIsConnected(true);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
 
69
+ // ์„ธ์…˜ ์„ค์ •
70
+ const sessionConfig = {
71
+ type: 'session.update',
72
+ session: {
73
+ modalities: ['text', 'audio'],
74
+ instructions: `You are a helpful translator. Translate between ${sourceLanguage === 'ko' ? 'Korean' : 'English'} and ${targetLanguage === 'ko' ? 'Korean' : 'English'}.`,
75
+ voice: 'alloy',
76
+ input_audio_format: 'pcm16',
77
+ output_audio_format: 'pcm16',
78
+ input_audio_transcription: {
79
+ model: 'whisper-1'
80
+ },
81
+ turn_detection: {
82
+ type: 'server_vad',
83
+ threshold: 0.5,
84
+ prefix_padding_ms: 300,
85
+ silence_duration_ms: 500
86
+ }
87
+ }
88
+ };
89
+ dc.send(JSON.stringify(sessionConfig));
90
+ });
91
+
92
+ dc.addEventListener('message', (e) => {
93
+ const event = JSON.parse(e.data);
94
+ handleRealtimeEvent(event);
95
+ });
96
+
97
+ dc.addEventListener('error', (error) => {
98
+ addLog(`๋ฐ์ดํ„ฐ ์ฑ„๋„ ์˜ค๋ฅ˜: ${error}`, 'error');
99
+ });
100
+
101
+ // SDP ์˜คํผ ์ƒ์„ฑ ๋ฐ ์—ฐ๊ฒฐ
102
+ const offer = await pc.createOffer();
103
+ await pc.setLocalDescription(offer);
104
+
105
+ // API ์—ฐ๊ฒฐ (์‹ค์ œ ๊ตฌํ˜„ ์‹œ ์‚ฌ์šฉ)
106
+ /*
107
+ const baseUrl = 'https://api.openai.com/v1/realtime';
108
+ const model = 'gpt-4o-realtime-preview-2025-06-03';
109
+ const response = await fetch(`${baseUrl}?model=${model}`, {
110
+ method: 'POST',
111
+ body: offer.sdp,
112
+ headers: {
113
+ 'Authorization': `Bearer ${key}`,
114
+ 'Content-Type': 'application/sdp'
115
  }
116
+ });
117
+
118
+ const answerSdp = await response.text();
119
+ const answer = {
120
+ type: 'answer',
121
+ sdp: answerSdp
122
+ };
123
+ await pc.setRemoteDescription(answer);
124
+ */
125
+
126
+ addLog('WebRTC ์—ฐ๊ฒฐ ์‹œ๋ฎฌ๋ ˆ์ด์…˜ ์™„๋ฃŒ', 'warning');
127
+ setIsConnected(true);
128
+ setIsLoading(false);
129
+
130
+ } catch (error) {
131
+ addLog(`์—ฐ๊ฒฐ ์˜ค๋ฅ˜: ${error.message}`, 'error');
132
+ setIsLoading(false);
133
+ }
134
+ };
135
+
136
+ // Realtime ์ด๋ฒคํŠธ ์ฒ˜๋ฆฌ
137
+ const handleRealtimeEvent = (event) => {
138
+ switch (event.type) {
139
+ case 'conversation.item.created':
140
+ if (event.item.role === 'user') {
141
+ setTranscript(event.item.content?.[0]?.transcript || '');
142
+ addLog(`์‚ฌ์šฉ์ž: ${event.item.content?.[0]?.transcript}`, 'info');
143
  }
144
+ break;
145
+
146
+ case 'conversation.item.input_audio_transcription.completed':
147
+ setTranscript(event.transcript);
148
+ addLog(`์ „์‚ฌ ์™„๋ฃŒ: ${event.transcript}`, 'info');
149
+ // ๋ฒˆ์—ญ ์š”์ฒญ
150
+ requestTranslation(event.transcript);
151
+ break;
152
+
153
+ case 'response.audio_transcript.delta':
154
+ setTranslation(prev => prev + event.delta);
155
+ break;
156
+
157
+ case 'response.audio_transcript.done':
158
+ addLog(`๋ฒˆ์—ญ ์™„๋ฃŒ: ${translation}`, 'success');
159
+ break;
160
+
161
+ case 'error':
162
+ addLog(`์˜ค๋ฅ˜: ${event.error.message}`, 'error');
163
+ break;
164
+
165
+ default:
166
+ console.log('Unhandled event:', event);
167
+ }
168
+ };
169
+
170
+ // ๋ฒˆ์—ญ ์š”์ฒญ
171
+ const requestTranslation = (text) => {
172
+ if (!dcRef.current || dcRef.current.readyState !== 'open') return;
173
+
174
+ const message = {
175
+ type: 'conversation.item.create',
176
+ item: {
177
+ type: 'message',
178
+ role: 'user',
179
+ content: [{
180
+ type: 'input_text',
181
+ text: `Translate this to ${targetLanguage === 'ko' ? 'Korean' : 'English'}: "${text}"`
182
+ }]
183
+ }
184
+ };
185
+
186
+ dcRef.current.send(JSON.stringify(message));
187
 
188
+ // ์‘๋‹ต ์ƒ์„ฑ ์š”์ฒญ
189
+ dcRef.current.send(JSON.stringify({ type: 'response.create' }));
190
+ };
191
+
192
+ // ๋…น์Œ ์‹œ์ž‘/์ค‘์ง€
193
+ const toggleRecording = () => {
194
+ if (!isConnected) {
195
+ addLog('๋จผ์ € ์—ฐ๊ฒฐ์„ ์‹œ์ž‘ํ•ด์ฃผ์„ธ์š”', 'warning');
196
+ return;
197
+ }
198
+
199
+ setIsRecording(!isRecording);
200
 
201
+ if (!isRecording) {
202
+ addLog('๋…น์Œ ์‹œ์ž‘', 'info');
203
+ // ์Œ์„ฑ ์ž…๋ ฅ ์‹œ์ž‘์„ ์œ„ํ•œ ์ด๋ฒคํŠธ ์ „์†ก
204
+ if (dcRef.current && dcRef.current.readyState === 'open') {
205
+ dcRef.current.send(JSON.stringify({
206
+ type: 'input_audio_buffer.commit'
207
+ }));
208
+ }
209
+ } else {
210
+ addLog('๋…น์Œ ์ค‘์ง€', 'info');
211
+ }
212
+ };
213
+
214
+ // ์–ธ์–ด ์ „ํ™˜
215
+ const swapLanguages = () => {
216
+ setSourceLanguage(targetLanguage);
217
+ setTargetLanguage(sourceLanguage);
218
+ addLog(`์–ธ์–ด ์ „ํ™˜: ${targetLanguage} โ†’ ${sourceLanguage}`, 'info');
219
+ };
220
+
221
+ // ์—ฐ๊ฒฐ ์ข…๋ฃŒ
222
+ const disconnect = () => {
223
+ if (pcRef.current) {
224
+ pcRef.current.close();
225
+ pcRef.current = null;
226
+ }
227
+ setIsConnected(false);
228
+ setIsRecording(false);
229
+ addLog('์—ฐ๊ฒฐ ์ข…๋ฃŒ๋จ', 'info');
230
+ };
231
+
232
+ // ์ปดํฌ๋„ŒํŠธ ์–ธ๋งˆ์šดํŠธ ์‹œ ์ •๋ฆฌ
233
+ useEffect(() => {
234
+ return () => {
235
+ if (pcRef.current) {
236
+ pcRef.current.close();
237
+ }
238
+ };
239
+ }, []);
240
+
241
+ return (
242
+ <div className="min-h-screen bg-gray-50 p-4">
243
+ <div className="max-w-4xl mx-auto">
244
+ <h1 className="text-3xl font-bold text-center mb-8 text-gray-800">
245
+ 실시간 음성 번역기
246
+ </h1>
247
+
248
+ {/* ์—ฐ๊ฒฐ ์ƒํƒœ */}
249
+ <div className="bg-white rounded-lg shadow-md p-6 mb-6">
250
+ <div className="flex items-center justify-between mb-4">
251
+ <h2 className="text-xl font-semibold">์—ฐ๊ฒฐ ์ƒํƒœ</h2>
252
+ <div className="flex items-center gap-2">
253
+ <div className={`w-3 h-3 rounded-full ${isConnected ? 'bg-green-500' : 'bg-red-500'}`} />
254
+ <span className="text-sm">{isConnected ? '์—ฐ๊ฒฐ๋จ' : '์—ฐ๊ฒฐ ์•ˆ๋จ'}</span>
255
+ </div>
256
+ </div>
257
+
258
+ <div className="flex gap-4">
259
+ {!isConnected ? (
260
+ <button
261
+ onClick={initializeConnection}
262
+ disabled={isLoading}
263
+ className="flex items-center gap-2 px-4 py-2 bg-blue-500 text-white rounded-lg hover:bg-blue-600 disabled:bg-gray-400"
264
+ >
265
+ {isLoading ? (
266
+ <>
267
+ <Loader2 className="w-4 h-4 animate-spin" />
268
+ ์—ฐ๊ฒฐ ์ค‘...
269
+ </>
270
+ ) : (
271
+ '์—ฐ๊ฒฐ ์‹œ์ž‘'
272
+ )}
273
+ </button>
274
+ ) : (
275
+ <button
276
+ onClick={disconnect}
277
+ className="px-4 py-2 bg-red-500 text-white rounded-lg hover:bg-red-600"
278
+ >
279
+ ์—ฐ๊ฒฐ ์ข…๋ฃŒ
280
+ </button>
281
+ )}
282
+ </div>
283
+ </div>
284
+
285
+ {/* ์–ธ์–ด ์„ ํƒ */}
286
+ <div className="bg-white rounded-lg shadow-md p-6 mb-6">
287
+ <div className="flex items-center justify-between gap-4">
288
+ <div className="flex-1">
289
+ <label className="block text-sm font-medium mb-2">์ž…๋ ฅ ์–ธ์–ด</label>
290
+ <select
291
+ value={sourceLanguage}
292
+ onChange={(e) => setSourceLanguage(e.target.value)}
293
+ className="w-full p-2 border rounded-lg"
294
+ >
295
+ <option value="ko">ํ•œ๊ตญ์–ด</option>
296
+ <option value="en">์˜์–ด</option>
297
+ </select>
298
+ </div>
299
 
300
+ <button
301
+ onClick={swapLanguages}
302
+ className="mt-6 p-2 hover:bg-gray-100 rounded-lg"
303
+ >
304
+ <Globe className="w-6 h-6" />
305
+ </button>
306
 
307
+ <div className="flex-1">
308
+ <label className="block text-sm font-medium mb-2">์ถœ๋ ฅ ์–ธ์–ด</label>
309
+ <select
310
+ value={targetLanguage}
311
+ onChange={(e) => setTargetLanguage(e.target.value)}
312
+ className="w-full p-2 border rounded-lg"
313
+ >
314
+ <option value="en">์˜์–ด</option>
315
+ <option value="ko">ํ•œ๊ตญ์–ด</option>
316
+ </select>
317
+ </div>
318
+ </div>
319
+ </div>
320
+
321
+ {/* ๋…น์Œ ์ปจํŠธ๋กค */}
322
+ <div className="bg-white rounded-lg shadow-md p-6 mb-6">
323
+ <div className="flex flex-col items-center">
324
+ <button
325
+ onClick={toggleRecording}
326
+ disabled={!isConnected}
327
+ className={`p-6 rounded-full transition-colors ${
328
+ isRecording
329
+ ? 'bg-red-500 hover:bg-red-600'
330
+ : 'bg-blue-500 hover:bg-blue-600 disabled:bg-gray-400'
331
+ } text-white`}
332
+ >
333
+ {isRecording ? (
334
+ <MicOff className="w-8 h-8" />
335
+ ) : (
336
+ <Mic className="w-8 h-8" />
337
+ )}
338
+ </button>
339
+ <p className="mt-4 text-sm text-gray-600">
340
+ {isRecording ? '๋…น์Œ ์ค‘... ํด๋ฆญํ•˜์—ฌ ์ค‘์ง€' : 'ํด๋ฆญํ•˜์—ฌ ๋…น์Œ ์‹œ์ž‘'}
341
+ </p>
342
+ </div>
343
+ </div>
344
+
345
+ {/* ๊ฒฐ๊ณผ ํ‘œ์‹œ */}
346
+ <div className="bg-white rounded-lg shadow-md p-6 mb-6">
347
+ <h3 className="text-lg font-semibold mb-4">๋ฒˆ์—ญ ๊ฒฐ๊ณผ</h3>
348
+
349
+ <div className="space-y-4">
350
+ <div>
351
+ <label className="block text-sm font-medium mb-2">์›๋ณธ ํ…์ŠคํŠธ</label>
352
+ <div className="p-4 bg-gray-50 rounded-lg min-h-[80px]">
353
+ {transcript || <span className="text-gray-400">์Œ์„ฑ ์ž…๋ ฅ์„ ๊ธฐ๋‹ค๋ฆฌ๋Š” ์ค‘...</span>}
354
+ </div>
355
+ </div>
356
+
357
+ <div>
358
+ <label className="block text-sm font-medium mb-2">๋ฒˆ์—ญ๋œ ํ…์ŠคํŠธ</label>
359
+ <div className="p-4 bg-blue-50 rounded-lg min-h-[80px]">
360
+ {translation || <span className="text-gray-400">๋ฒˆ์—ญ ๊ฒฐ๊ณผ๊ฐ€ ์—ฌ๊ธฐ์— ํ‘œ์‹œ๋ฉ๋‹ˆ๋‹ค...</span>}
361
+ </div>
362
+ </div>
363
+ </div>
364
+ </div>
365
+
366
+ {/* ๋กœ๊ทธ */}
367
+ <div className="bg-white rounded-lg shadow-md p-6">
368
+ <h3 className="text-lg font-semibold mb-4">ํ™œ๋™ ๋กœ๊ทธ</h3>
369
+ <div className="h-48 overflow-y-auto bg-gray-50 rounded-lg p-4 text-sm">
370
+ {logs.length === 0 ? (
371
+ <p className="text-gray-400">๋กœ๊ทธ๊ฐ€ ์—ฌ๊ธฐ์— ํ‘œ์‹œ๋ฉ๋‹ˆ๋‹ค...</p>
372
+ ) : (
373
+ logs.map((log, index) => (
374
+ <div
375
+ key={index}
376
+ className={`mb-2 ${
377
+ log.type === 'error' ? 'text-red-600' :
378
+ log.type === 'success' ? 'text-green-600' :
379
+ log.type === 'warning' ? 'text-yellow-600' :
380
+ 'text-gray-700'
381
+ }`}
382
+ >
383
+ <span className="text-gray-500">[{log.timestamp}]</span> {log.message}
384
+ </div>
385
+ ))
386
+ )}
387
+ </div>
388
+ </div>
389
+
390
+ {/* ์˜ค๋””์˜ค ์—˜๋ฆฌ๋จผํŠธ (์ˆจ๊น€) */}
391
+ <audio ref={audioElRef} autoPlay style={{ display: 'none' }} />
392
+
393
+ {/* ์‚ฌ์šฉ ์•ˆ๋‚ด */}
394
+ <div className="mt-6 p-4 bg-yellow-50 rounded-lg">
395
+ <h4 className="font-semibold text-yellow-800 mb-2">⚠️ 중요 안내</h4>
396
+ <ul className="text-sm text-yellow-700 space-y-1">
397
+ <li>โ€ข ์ด ์• ํ”Œ๋ฆฌ์ผ€์ด์…˜์„ ์‚ฌ์šฉํ•˜๋ ค๋ฉด ์„œ๋ฒ„์—์„œ ephemeral key๋ฅผ ์ƒ์„ฑํ•˜๋Š” ์—”๋“œํฌ์ธํŠธ๊ฐ€ ํ•„์š”ํ•ฉ๋‹ˆ๋‹ค.</li>
398
+ <li>• OpenAI API 키가 필요하며, 클라이언트 측에서 직접 사용하면 안 됩니다.</li>
399
+ <li>• 실제 구현 시 주석 처리된 API 연결 코드를 활성화해야 합니다.</li>
400
+ <li>• 마이크 권한이 필요합니다.</li>
401
+ </ul>
402
+ </div>
403
+ </div>
404
+ </div>
405
+ );
406
+ };
407
+
408
+ export default RealtimeVoiceTranslator;