openfree committed on
Commit
b3067c5
·
verified ·
1 Parent(s): 1efe32e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +363 -399
app.py CHANGED
@@ -1,408 +1,372 @@
1
- import React, { useState, useRef, useEffect } from 'react';
2
- import { Mic, MicOff, Volume2, Globe, Loader2 } from 'lucide-react';
3
-
4
- const RealtimeVoiceTranslator = () => {
5
- const [isConnected, setIsConnected] = useState(false);
6
- const [isRecording, setIsRecording] = useState(false);
7
- const [transcript, setTranscript] = useState('');
8
- const [translation, setTranslation] = useState('');
9
- const [sourceLanguage, setSourceLanguage] = useState('ko');
10
- const [targetLanguage, setTargetLanguage] = useState('en');
11
- const [ephemeralKey, setEphemeralKey] = useState('');
12
- const [logs, setLogs] = useState([]);
13
- const [isLoading, setIsLoading] = useState(false);
14
-
15
- const pcRef = useRef(null);
16
- const dcRef = useRef(null);
17
- const audioElRef = useRef(null);
18
-
19
- // ๋กœ๊ทธ ์ถ”๊ฐ€ ํ•จ์ˆ˜
20
- const addLog = (message, type = 'info') => {
21
- const timestamp = new Date().toLocaleTimeString();
22
- setLogs(prev => [...prev, { message, type, timestamp }]);
23
- };
24
-
25
- // Ephemeral key ๊ฐ€์ ธ์˜ค๊ธฐ (์‹œ๋ฎฌ๋ ˆ์ด์…˜)
26
- const getEphemeralKey = async () => {
27
- // ์‹ค์ œ ๊ตฌํ˜„์—์„œ๋Š” ์„œ๋ฒ„์—์„œ ephemeral key๋ฅผ ๊ฐ€์ ธ์™€์•ผ ํ•ฉ๋‹ˆ๋‹ค
28
- addLog('Ephemeral key๊ฐ€ ํ•„์š”ํ•ฉ๋‹ˆ๋‹ค. ์„œ๋ฒ„ ์—”๋“œํฌ์ธํŠธ๋ฅผ ๊ตฌํ˜„ํ•ด์ฃผ์„ธ์š”.', 'warning');
29
- return 'YOUR_EPHEMERAL_KEY';
30
- };
31
-
32
- // WebRTC ์—ฐ๊ฒฐ ์ดˆ๊ธฐํ™”
33
- const initializeConnection = async () => {
34
- try {
35
- setIsLoading(true);
36
- addLog('์—ฐ๊ฒฐ ์ดˆ๊ธฐํ™” ์ค‘...', 'info');
37
-
38
- // Ephemeral key ๊ฐ€์ ธ์˜ค๊ธฐ
39
- const key = await getEphemeralKey();
40
- setEphemeralKey(key);
41
-
42
- // Peer connection ์ƒ์„ฑ
43
- const pc = new RTCPeerConnection({
44
- iceServers: [{ urls: 'stun:stun.l.google.com:19302' }]
45
- });
46
- pcRef.current = pc;
47
-
48
- // ์›๊ฒฉ ์˜ค๋””์˜ค ์ŠคํŠธ๋ฆผ ์„ค์ •
49
- const audioEl = audioElRef.current;
50
- pc.ontrack = (e) => {
51
- audioEl.srcObject = e.streams[0];
52
- addLog('์˜ค๋””์˜ค ์ŠคํŠธ๋ฆผ ์—ฐ๊ฒฐ๋จ', 'success');
53
- };
54
-
55
- // ๋กœ์ปฌ ์˜ค๋””์˜ค ํŠธ๋ž™ ์ถ”๊ฐ€
56
- const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
57
- stream.getTracks().forEach(track => {
58
- pc.addTrack(track, stream);
59
- });
60
-
61
- // ๋ฐ์ดํ„ฐ ์ฑ„๋„ ์„ค์ •
62
- const dc = pc.createDataChannel('oai-events');
63
- dcRef.current = dc;
64
-
65
- dc.addEventListener('open', () => {
66
- addLog('๋ฐ์ดํ„ฐ ์ฑ„๋„ ์—ด๋ฆผ', 'success');
67
- setIsConnected(true);
68
 
69
- // ์„ธ์…˜ ์„ค์ •
70
- const sessionConfig = {
71
- type: 'session.update',
72
- session: {
73
- modalities: ['text', 'audio'],
74
- instructions: `You are a helpful translator. Translate between ${sourceLanguage === 'ko' ? 'Korean' : 'English'} and ${targetLanguage === 'ko' ? 'Korean' : 'English'}.`,
75
- voice: 'alloy',
76
- input_audio_format: 'pcm16',
77
- output_audio_format: 'pcm16',
78
- input_audio_transcription: {
79
- model: 'whisper-1'
80
- },
81
- turn_detection: {
82
- type: 'server_vad',
83
- threshold: 0.5,
84
- prefix_padding_ms: 300,
85
- silence_duration_ms: 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  }
87
- }
88
- };
89
- dc.send(JSON.stringify(sessionConfig));
90
- });
91
-
92
- dc.addEventListener('message', (e) => {
93
- const event = JSON.parse(e.data);
94
- handleRealtimeEvent(event);
95
- });
96
-
97
- dc.addEventListener('error', (error) => {
98
- addLog(`๋ฐ์ดํ„ฐ ์ฑ„๋„ ์˜ค๋ฅ˜: ${error}`, 'error');
99
- });
100
-
101
- // SDP ์˜คํผ ์ƒ์„ฑ ๋ฐ ์—ฐ๊ฒฐ
102
- const offer = await pc.createOffer();
103
- await pc.setLocalDescription(offer);
104
-
105
- // API ์—ฐ๊ฒฐ (์‹ค์ œ ๊ตฌํ˜„ ์‹œ ์‚ฌ์šฉ)
106
- /*
107
- const baseUrl = 'https://api.openai.com/v1/realtime';
108
- const model = 'gpt-4o-realtime-preview-2025-06-03';
109
- const response = await fetch(`${baseUrl}?model=${model}`, {
110
- method: 'POST',
111
- body: offer.sdp,
112
- headers: {
113
- 'Authorization': `Bearer ${key}`,
114
- 'Content-Type': 'application/sdp'
115
  }
116
- });
117
-
118
- const answerSdp = await response.text();
119
- const answer = {
120
- type: 'answer',
121
- sdp: answerSdp
122
- };
123
- await pc.setRemoteDescription(answer);
124
- */
125
-
126
- addLog('WebRTC ์—ฐ๊ฒฐ ์‹œ๋ฎฌ๋ ˆ์ด์…˜ ์™„๋ฃŒ', 'warning');
127
- setIsConnected(true);
128
- setIsLoading(false);
129
-
130
- } catch (error) {
131
- addLog(`์—ฐ๊ฒฐ ์˜ค๋ฅ˜: ${error.message}`, 'error');
132
- setIsLoading(false);
133
- }
134
- };
135
-
136
- // Realtime ์ด๋ฒคํŠธ ์ฒ˜๋ฆฌ
137
- const handleRealtimeEvent = (event) => {
138
- switch (event.type) {
139
- case 'conversation.item.created':
140
- if (event.item.role === 'user') {
141
- setTranscript(event.item.content?.[0]?.transcript || '');
142
- addLog(`์‚ฌ์šฉ์ž: ${event.item.content?.[0]?.transcript}`, 'info');
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  }
144
- break;
145
-
146
- case 'conversation.item.input_audio_transcription.completed':
147
- setTranscript(event.transcript);
148
- addLog(`์ „์‚ฌ ์™„๋ฃŒ: ${event.transcript}`, 'info');
149
- // ๋ฒˆ์—ญ ์š”์ฒญ
150
- requestTranslation(event.transcript);
151
- break;
152
-
153
- case 'response.audio_transcript.delta':
154
- setTranslation(prev => prev + event.delta);
155
- break;
156
-
157
- case 'response.audio_transcript.done':
158
- addLog(`๋ฒˆ์—ญ ์™„๋ฃŒ: ${translation}`, 'success');
159
- break;
160
-
161
- case 'error':
162
- addLog(`์˜ค๋ฅ˜: ${event.error.message}`, 'error');
163
- break;
164
-
165
- default:
166
- console.log('Unhandled event:', event);
167
- }
168
- };
169
-
170
- // ๋ฒˆ์—ญ ์š”์ฒญ
171
- const requestTranslation = (text) => {
172
- if (!dcRef.current || dcRef.current.readyState !== 'open') return;
173
-
174
- const message = {
175
- type: 'conversation.item.create',
176
- item: {
177
- type: 'message',
178
- role: 'user',
179
- content: [{
180
- type: 'input_text',
181
- text: `Translate this to ${targetLanguage === 'ko' ? 'Korean' : 'English'}: "${text}"`
182
- }]
183
- }
184
- };
185
-
186
- dcRef.current.send(JSON.stringify(message));
187
 
188
- // ์‘๋‹ต ์ƒ์„ฑ ์š”์ฒญ
189
- dcRef.current.send(JSON.stringify({ type: 'response.create' }));
190
- };
191
-
192
- // ๋…น์Œ ์‹œ์ž‘/์ค‘์ง€
193
- const toggleRecording = () => {
194
- if (!isConnected) {
195
- addLog('๋จผ์ € ์—ฐ๊ฒฐ์„ ์‹œ์ž‘ํ•ด์ฃผ์„ธ์š”', 'warning');
196
- return;
197
- }
198
-
199
- setIsRecording(!isRecording);
 
 
 
 
 
 
 
 
 
 
200
 
201
- if (!isRecording) {
202
- addLog('๋…น์Œ ์‹œ์ž‘', 'info');
203
- // ์Œ์„ฑ ์ž…๋ ฅ ์‹œ์ž‘์„ ์œ„ํ•œ ์ด๋ฒคํŠธ ์ „์†ก
204
- if (dcRef.current && dcRef.current.readyState === 'open') {
205
- dcRef.current.send(JSON.stringify({
206
- type: 'input_audio_buffer.commit'
207
- }));
208
- }
209
- } else {
210
- addLog('๋…น์Œ ์ค‘์ง€', 'info');
211
- }
212
- };
213
-
214
- // ์–ธ์–ด ์ „ํ™˜
215
- const swapLanguages = () => {
216
- setSourceLanguage(targetLanguage);
217
- setTargetLanguage(sourceLanguage);
218
- addLog(`์–ธ์–ด ์ „ํ™˜: ${targetLanguage} โ†’ ${sourceLanguage}`, 'info');
219
- };
220
-
221
- // ์—ฐ๊ฒฐ ์ข…๋ฃŒ
222
- const disconnect = () => {
223
- if (pcRef.current) {
224
- pcRef.current.close();
225
- pcRef.current = null;
226
- }
227
- setIsConnected(false);
228
- setIsRecording(false);
229
- addLog('์—ฐ๊ฒฐ ์ข…๋ฃŒ๋จ', 'info');
230
- };
231
-
232
- // ์ปดํฌ๋„ŒํŠธ ์–ธ๋งˆ์šดํŠธ ์‹œ ์ •๋ฆฌ
233
- useEffect(() => {
234
- return () => {
235
- if (pcRef.current) {
236
- pcRef.current.close();
237
- }
238
- };
239
- }, []);
240
-
241
- return (
242
- <div className="min-h-screen bg-gray-50 p-4">
243
- <div className="max-w-4xl mx-auto">
244
- <h1 className="text-3xl font-bold text-center mb-8 text-gray-800">
245
- ์‹ค์‹œ๊ฐ„ ์Œ์„ฑ ๋ฒˆ์—ญ๊ธฐ
246
- </h1>
247
-
248
- {/* ์—ฐ๊ฒฐ ์ƒํƒœ */}
249
- <div className="bg-white rounded-lg shadow-md p-6 mb-6">
250
- <div className="flex items-center justify-between mb-4">
251
- <h2 className="text-xl font-semibold">์—ฐ๊ฒฐ ์ƒํƒœ</h2>
252
- <div className="flex items-center gap-2">
253
- <div className={`w-3 h-3 rounded-full ${isConnected ? 'bg-green-500' : 'bg-red-500'}`} />
254
- <span className="text-sm">{isConnected ? '์—ฐ๊ฒฐ๋จ' : '์—ฐ๊ฒฐ ์•ˆ๋จ'}</span>
255
- </div>
256
- </div>
257
-
258
- <div className="flex gap-4">
259
- {!isConnected ? (
260
- <button
261
- onClick={initializeConnection}
262
- disabled={isLoading}
263
- className="flex items-center gap-2 px-4 py-2 bg-blue-500 text-white rounded-lg hover:bg-blue-600 disabled:bg-gray-400"
264
- >
265
- {isLoading ? (
266
- <>
267
- <Loader2 className="w-4 h-4 animate-spin" />
268
- ์—ฐ๊ฒฐ ์ค‘...
269
- </>
270
- ) : (
271
- '์—ฐ๊ฒฐ ์‹œ์ž‘'
272
- )}
273
- </button>
274
- ) : (
275
- <button
276
- onClick={disconnect}
277
- className="px-4 py-2 bg-red-500 text-white rounded-lg hover:bg-red-600"
278
- >
279
- ์—ฐ๊ฒฐ ์ข…๋ฃŒ
280
- </button>
281
- )}
282
- </div>
283
- </div>
284
-
285
- {/* ์–ธ์–ด ์„ ํƒ */}
286
- <div className="bg-white rounded-lg shadow-md p-6 mb-6">
287
- <div className="flex items-center justify-between gap-4">
288
- <div className="flex-1">
289
- <label className="block text-sm font-medium mb-2">์ž…๋ ฅ ์–ธ์–ด</label>
290
- <select
291
- value={sourceLanguage}
292
- onChange={(e) => setSourceLanguage(e.target.value)}
293
- className="w-full p-2 border rounded-lg"
294
- >
295
- <option value="ko">ํ•œ๊ตญ์–ด</option>
296
- <option value="en">์˜์–ด</option>
297
- </select>
298
- </div>
299
 
300
- <button
301
- onClick={swapLanguages}
302
- className="mt-6 p-2 hover:bg-gray-100 rounded-lg"
303
- >
304
- <Globe className="w-6 h-6" />
305
- </button>
306
 
307
- <div className="flex-1">
308
- <label className="block text-sm font-medium mb-2">์ถœ๋ ฅ ์–ธ์–ด</label>
309
- <select
310
- value={targetLanguage}
311
- onChange={(e) => setTargetLanguage(e.target.value)}
312
- className="w-full p-2 border rounded-lg"
313
- >
314
- <option value="en">์˜์–ด</option>
315
- <option value="ko">ํ•œ๊ตญ์–ด</option>
316
- </select>
317
- </div>
318
- </div>
319
- </div>
320
-
321
- {/* ๋…น์Œ ์ปจํŠธ๋กค */}
322
- <div className="bg-white rounded-lg shadow-md p-6 mb-6">
323
- <div className="flex flex-col items-center">
324
- <button
325
- onClick={toggleRecording}
326
- disabled={!isConnected}
327
- className={`p-6 rounded-full transition-colors ${
328
- isRecording
329
- ? 'bg-red-500 hover:bg-red-600'
330
- : 'bg-blue-500 hover:bg-blue-600 disabled:bg-gray-400'
331
- } text-white`}
332
- >
333
- {isRecording ? (
334
- <MicOff className="w-8 h-8" />
335
- ) : (
336
- <Mic className="w-8 h-8" />
337
- )}
338
- </button>
339
- <p className="mt-4 text-sm text-gray-600">
340
- {isRecording ? '๋…น์Œ ์ค‘... ํด๋ฆญํ•˜์—ฌ ์ค‘์ง€' : 'ํด๋ฆญํ•˜์—ฌ ๋…น์Œ ์‹œ์ž‘'}
341
- </p>
342
- </div>
343
- </div>
344
-
345
- {/* ๊ฒฐ๊ณผ ํ‘œ์‹œ */}
346
- <div className="bg-white rounded-lg shadow-md p-6 mb-6">
347
- <h3 className="text-lg font-semibold mb-4">๋ฒˆ์—ญ ๊ฒฐ๊ณผ</h3>
348
-
349
- <div className="space-y-4">
350
- <div>
351
- <label className="block text-sm font-medium mb-2">์›๋ณธ ํ…์ŠคํŠธ</label>
352
- <div className="p-4 bg-gray-50 rounded-lg min-h-[80px]">
353
- {transcript || <span className="text-gray-400">์Œ์„ฑ ์ž…๋ ฅ์„ ๊ธฐ๋‹ค๋ฆฌ๋Š” ์ค‘...</span>}
354
- </div>
355
- </div>
356
 
357
- <div>
358
- <label className="block text-sm font-medium mb-2">๋ฒˆ์—ญ๋œ ํ…์ŠคํŠธ</label>
359
- <div className="p-4 bg-blue-50 rounded-lg min-h-[80px]">
360
- {translation || <span className="text-gray-400">๋ฒˆ์—ญ ๊ฒฐ๊ณผ๊ฐ€ ์—ฌ๊ธฐ์— ํ‘œ์‹œ๋ฉ๋‹ˆ๋‹ค...</span>}
361
- </div>
362
- </div>
363
- </div>
364
- </div>
365
-
366
- {/* ๋กœ๊ทธ */}
367
- <div className="bg-white rounded-lg shadow-md p-6">
368
- <h3 className="text-lg font-semibold mb-4">ํ™œ๋™ ๋กœ๊ทธ</h3>
369
- <div className="h-48 overflow-y-auto bg-gray-50 rounded-lg p-4 text-sm">
370
- {logs.length === 0 ? (
371
- <p className="text-gray-400">๋กœ๊ทธ๊ฐ€ ์—ฌ๊ธฐ์— ํ‘œ์‹œ๋ฉ๋‹ˆ๋‹ค...</p>
372
- ) : (
373
- logs.map((log, index) => (
374
- <div
375
- key={index}
376
- className={`mb-2 ${
377
- log.type === 'error' ? 'text-red-600' :
378
- log.type === 'success' ? 'text-green-600' :
379
- log.type === 'warning' ? 'text-yellow-600' :
380
- 'text-gray-700'
381
- }`}
382
- >
383
- <span className="text-gray-500">[{log.timestamp}]</span> {log.message}
384
- </div>
385
- ))
386
- )}
387
- </div>
388
- </div>
389
-
390
- {/* ์˜ค๋””๏ฟฝ๏ฟฝ๏ฟฝ ์—˜๋ฆฌ๋จผํŠธ (์ˆจ๊น€) */}
391
- <audio ref={audioElRef} autoPlay style={{ display: 'none' }} />
392
-
393
- {/* ์‚ฌ์šฉ ์•ˆ๋‚ด */}
394
- <div className="mt-6 p-4 bg-yellow-50 rounded-lg">
395
- <h4 className="font-semibold text-yellow-800 mb-2">โš ๏ธ ์ค‘์š” ์•ˆ๋‚ด</h4>
396
- <ul className="text-sm text-yellow-700 space-y-1">
397
- <li>โ€ข ์ด ์• ํ”Œ๋ฆฌ์ผ€์ด์…˜์„ ์‚ฌ์šฉํ•˜๋ ค๋ฉด ์„œ๋ฒ„์—์„œ ephemeral key๋ฅผ ์ƒ์„ฑํ•˜๋Š” ์—”๋“œํฌ์ธํŠธ๊ฐ€ ํ•„์š”ํ•ฉ๋‹ˆ๋‹ค.</li>
398
- <li>โ€ข OpenAI API ํ‚ค๊ฐ€ ํ•„์š”ํ•˜๋ฉฐ, ํด๋ผ์ด์–ธํŠธ ์ธก์—์„œ ์ง์ ‘ ์‚ฌ์šฉํ•˜๋ฉด ์•ˆ ๋ฉ๋‹ˆ๋‹ค.</li>
399
- <li>โ€ข ์‹ค์ œ ๊ตฌํ˜„ ์‹œ ์ฃผ์„ ์ฒ˜๋ฆฌ๋œ API ์—ฐ๊ฒฐ ์ฝ”๋“œ๋ฅผ ํ™œ์„ฑํ™”ํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค.</li>
400
- <li>โ€ข ๋งˆ์ดํฌ ๊ถŒํ•œ์ด ํ•„์š”ํ•ฉ๋‹ˆ๋‹ค.</li>
401
- </ul>
402
- </div>
403
- </div>
404
- </div>
405
- );
406
- };
407
-
408
- export default RealtimeVoiceTranslator;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import base64
import io
import json
import os
import queue
import threading
import time
import wave
from datetime import datetime

import gradio as gr
import numpy as np
import pyaudio
import websocket
13
+
14
+ class RealtimeTranslator:
15
+ def __init__(self):
16
+ self.ws = None
17
+ self.api_key = os.getenv("OPENAI_API_KEY")
18
+ self.audio_queue = queue.Queue()
19
+ self.transcript_queue = queue.Queue()
20
+ self.translation_queue = queue.Queue()
21
+ self.is_connected = False
22
+ self.is_recording = False
23
+ self.source_lang = "ko"
24
+ self.target_lang = "en"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
+ # PyAudio ์„ค์ •
27
+ self.p = pyaudio.PyAudio()
28
+ self.sample_rate = 24000
29
+ self.chunk_size = 1024
30
+ self.audio_format = pyaudio.paInt16
31
+
32
+ def connect_websocket(self):
33
+ """WebSocket ์—ฐ๊ฒฐ ์„ค์ •"""
34
+ try:
35
+ url = "wss://api.openai.com/v1/realtime?model=gpt-4o-realtime-preview-2024-12-17"
36
+ headers = {
37
+ "Authorization": f"Bearer {self.api_key}",
38
+ "OpenAI-Beta": "realtime=v1"
39
+ }
40
+
41
+ self.ws = websocket.WebSocketApp(
42
+ url,
43
+ header=headers,
44
+ on_open=self.on_open,
45
+ on_message=self.on_message,
46
+ on_error=self.on_error,
47
+ on_close=self.on_close
48
+ )
49
+
50
+ # WebSocket์„ ๋ณ„๋„ ์Šค๋ ˆ๋“œ์—์„œ ์‹คํ–‰
51
+ wst = threading.Thread(target=self.ws.run_forever)
52
+ wst.daemon = True
53
+ wst.start()
54
+
55
+ return "์—ฐ๊ฒฐ ์„ฑ๊ณต"
56
+ except Exception as e:
57
+ return f"์—ฐ๊ฒฐ ์‹คํŒจ: {str(e)}"
58
+
59
+ def on_open(self, ws):
60
+ """WebSocket ์—ฐ๊ฒฐ ์‹œ ํ˜ธ์ถœ"""
61
+ self.is_connected = True
62
+ print("WebSocket ์—ฐ๊ฒฐ๋จ")
63
+
64
+ # ์„ธ๏ฟฝ๏ฟฝ ์„ค์ •
65
+ session_update = {
66
+ "type": "session.update",
67
+ "session": {
68
+ "modalities": ["text", "audio"],
69
+ "instructions": f"You are a helpful translator. Translate between {self.get_language_name(self.source_lang)} and {self.get_language_name(self.target_lang)}. Respond with both the transcription and translation.",
70
+ "voice": "alloy",
71
+ "input_audio_format": "pcm16",
72
+ "output_audio_format": "pcm16",
73
+ "input_audio_transcription": {
74
+ "model": "whisper-1"
75
+ },
76
+ "turn_detection": {
77
+ "type": "server_vad",
78
+ "threshold": 0.5,
79
+ "prefix_padding_ms": 300,
80
+ "silence_duration_ms": 500
81
+ }
82
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  }
84
+ ws.send(json.dumps(session_update))
85
+
86
+ def on_message(self, ws, message):
87
+ """๋ฉ”์‹œ์ง€ ์ˆ˜์‹  ์‹œ ํ˜ธ์ถœ"""
88
+ try:
89
+ event = json.loads(message)
90
+ event_type = event.get("type")
91
+
92
+ if event_type == "conversation.item.input_audio_transcription.completed":
93
+ # ์Œ์„ฑ ์ „์‚ฌ ์™„๋ฃŒ
94
+ transcript = event.get("transcript", "")
95
+ self.transcript_queue.put(transcript)
96
+
97
+ # ๋ฒˆ์—ญ ์š”์ฒญ
98
+ self.request_translation(transcript)
99
+
100
+ elif event_type == "response.text.delta":
101
+ # ๋ฒˆ์—ญ ๊ฒฐ๊ณผ ์ˆ˜์‹ 
102
+ delta = event.get("delta", "")
103
+ self.translation_queue.put(delta)
104
+
105
+ elif event_type == "response.audio.delta":
106
+ # ์˜ค๋””์˜ค ๋ฐ์ดํ„ฐ ์ˆ˜์‹ 
107
+ audio_data = base64.b64decode(event.get("delta", ""))
108
+ self.audio_queue.put(audio_data)
109
+
110
+ elif event_type == "error":
111
+ error_msg = event.get("error", {}).get("message", "Unknown error")
112
+ print(f"Error: {error_msg}")
113
+
114
+ except Exception as e:
115
+ print(f"๋ฉ”์‹œ์ง€ ์ฒ˜๋ฆฌ ์˜ค๋ฅ˜: {str(e)}")
116
+
117
+ def on_error(self, ws, error):
118
+ """์˜ค๋ฅ˜ ๋ฐœ์ƒ ์‹œ ํ˜ธ์ถœ"""
119
+ print(f"WebSocket ์˜ค๋ฅ˜: {error}")
120
+ self.is_connected = False
121
+
122
+ def on_close(self, ws, close_status_code, close_msg):
123
+ """์—ฐ๊ฒฐ ์ข…๋ฃŒ ์‹œ ํ˜ธ์ถœ"""
124
+ print("WebSocket ์—ฐ๊ฒฐ ์ข…๋ฃŒ")
125
+ self.is_connected = False
126
+
127
+ def get_language_name(self, lang_code):
128
+ """์–ธ์–ด ์ฝ”๋“œ๋ฅผ ์–ธ์–ด ์ด๋ฆ„์œผ๋กœ ๋ณ€ํ™˜"""
129
+ languages = {
130
+ "ko": "Korean",
131
+ "en": "English",
132
+ "ja": "Japanese",
133
+ "zh": "Chinese",
134
+ "es": "Spanish",
135
+ "fr": "French"
136
  }
137
+ return languages.get(lang_code, lang_code)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
 
139
+ def request_translation(self, text):
140
+ """๋ฒˆ์—ญ ์š”์ฒญ"""
141
+ if not self.ws or not self.is_connected:
142
+ return
143
+
144
+ message = {
145
+ "type": "conversation.item.create",
146
+ "item": {
147
+ "type": "message",
148
+ "role": "user",
149
+ "content": [{
150
+ "type": "input_text",
151
+ "text": f"Translate this {self.get_language_name(self.source_lang)} text to {self.get_language_name(self.target_lang)}: '{text}'"
152
+ }]
153
+ }
154
+ }
155
+
156
+ self.ws.send(json.dumps(message))
157
+
158
+ # ์‘๋‹ต ์ƒ์„ฑ ์š”์ฒญ
159
+ response_create = {"type": "response.create"}
160
+ self.ws.send(json.dumps(response_create))
161
 
162
+ def send_audio_chunk(self, audio_data):
163
+ """์˜ค๋””์˜ค ์ฒญํฌ ์ „์†ก"""
164
+ if not self.ws or not self.is_connected:
165
+ return
166
+
167
+ # PCM16 ํ˜•์‹์œผ๋กœ ์ธ์ฝ”๋”ฉ
168
+ audio_base64 = base64.b64encode(audio_data).decode('utf-8')
169
+
170
+ message = {
171
+ "type": "input_audio_buffer.append",
172
+ "audio": audio_base64
173
+ }
174
+
175
+ self.ws.send(json.dumps(message))
176
+
177
+ def process_audio(self, audio_file):
178
+ """์˜ค๋””์˜ค ํŒŒ์ผ ์ฒ˜๋ฆฌ ๋ฐ ์ „์†ก"""
179
+ if not self.is_connected:
180
+ return "WebSocket์ด ์—ฐ๊ฒฐ๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค.", ""
181
+
182
+ try:
183
+ # ์˜ค๋””์˜ค ํŒŒ์ผ ์ฝ๊ธฐ
184
+ with wave.open(audio_file, 'rb') as wf:
185
+ # ์˜ค๋””์˜ค๋ฅผ 24kHz PCM16์œผ๋กœ ๋ณ€ํ™˜ ํ•„์š”
186
+ audio_data = wf.readframes(wf.getnframes())
187
+
188
+ # ์˜ค๋””์˜ค ๋ฐ์ดํ„ฐ๋ฅผ ์ฒญํฌ๋กœ ๋‚˜๋ˆ„์–ด ์ „์†ก
189
+ chunk_size = 4096
190
+ for i in range(0, len(audio_data), chunk_size):
191
+ chunk = audio_data[i:i+chunk_size]
192
+ self.send_audio_chunk(chunk)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
 
194
+ # ์˜ค๋””์˜ค ๋ฒ„ํผ ์ปค๋ฐ‹
195
+ commit_message = {"type": "input_audio_buffer.commit"}
196
+ self.ws.send(json.dumps(commit_message))
 
 
 
197
 
198
+ # ์ „์‚ฌ ๋ฐ ๋ฒˆ์—ญ ๊ฒฐ๊ณผ ๋Œ€๊ธฐ
199
+ transcript = ""
200
+ translation = ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
 
202
+ # ํƒ€์ž„์•„์›ƒ ์„ค์ • (10์ดˆ)
203
+ import time
204
+ timeout = 10
205
+ start_time = time.time()
206
+
207
+ while time.time() - start_time < timeout:
208
+ # ์ „์‚ฌ ๊ฒฐ๊ณผ ํ™•์ธ
209
+ try:
210
+ transcript = self.transcript_queue.get(timeout=0.1)
211
+ except queue.Empty:
212
+ pass
213
+
214
+ # ๋ฒˆ์—ญ ๊ฒฐ๊ณผ ํ™•์ธ
215
+ try:
216
+ while not self.translation_queue.empty():
217
+ translation += self.translation_queue.get()
218
+ except queue.Empty:
219
+ pass
220
+
221
+ if transcript and translation:
222
+ break
223
+
224
+ return transcript, translation
225
+
226
+ except Exception as e:
227
+ return f"์˜ค๋ฅ˜: {str(e)}", ""
228
+
229
+ def disconnect(self):
230
+ """WebSocket ์—ฐ๊ฒฐ ์ข…๋ฃŒ"""
231
+ if self.ws:
232
+ self.ws.close()
233
+ self.is_connected = False
234
+ return "์—ฐ๊ฒฐ ์ข…๋ฃŒ๋จ"
235
+
236
+ # Gradio ์ธํ„ฐํŽ˜์ด์Šค ์ƒ์„ฑ
237
+ def create_interface():
238
+ translator = RealtimeTranslator()
239
+
240
+ def connect():
241
+ if not translator.api_key:
242
+ return "API ํ‚ค๊ฐ€ ์„ค์ •๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค. ํ™˜๊ฒฝ ๋ณ€์ˆ˜ OPENAI_API_KEY๋ฅผ ์„ค์ •ํ•˜์„ธ์š”.", gr.update(value=False)
243
+ result = translator.connect_websocket()
244
+ return result, gr.update(value=translator.is_connected)
245
+
246
+ def disconnect():
247
+ result = translator.disconnect()
248
+ return result, gr.update(value=False)
249
+
250
+ def translate_audio(audio_file, source_lang, target_lang):
251
+ if not audio_file:
252
+ return "์˜ค๋””์˜ค ํŒŒ์ผ์„ ์„ ํƒํ•˜์„ธ์š”.", "", None
253
+
254
+ translator.source_lang = source_lang
255
+ translator.target_lang = target_lang
256
+
257
+ transcript, translation = translator.process_audio(audio_file)
258
+
259
+ # ์˜ค๋””์˜ค ์‘๋‹ต ์ฒ˜๋ฆฌ (ํ˜„์žฌ๋Š” ํ…์ŠคํŠธ๋งŒ ๋ฐ˜ํ™˜)
260
+ return transcript, translation, None
261
+
262
+ def swap_languages(source, target):
263
+ return target, source
264
+
265
+ with gr.Blocks(title="์‹ค์‹œ๊ฐ„ ์Œ์„ฑ ๋ฒˆ์—ญ๊ธฐ") as demo:
266
+ gr.Markdown("# ๐ŸŽ™๏ธ OpenAI Realtime API ์Œ์„ฑ ๋ฒˆ์—ญ๊ธฐ")
267
+ gr.Markdown("์‹ค์‹œ๊ฐ„์œผ๋กœ ์Œ์„ฑ์„ ์ „์‚ฌํ•˜๊ณ  ๋ฒˆ์—ญํ•ฉ๋‹ˆ๋‹ค.")
268
+
269
+ with gr.Row():
270
+ with gr.Column(scale=1):
271
+ gr.Markdown("### ์—ฐ๊ฒฐ ์ƒํƒœ")
272
+ connection_status = gr.Checkbox(label="์—ฐ๊ฒฐ๋จ", value=False, interactive=False)
273
+ connect_btn = gr.Button("์—ฐ๊ฒฐ", variant="primary")
274
+ disconnect_btn = gr.Button("์—ฐ๊ฒฐ ์ข…๋ฃŒ", variant="secondary")
275
+ status_text = gr.Textbox(label="์ƒํƒœ ๋ฉ”์‹œ์ง€", value="์—ฐ๊ฒฐ๋˜์ง€ ์•Š์Œ")
276
+
277
+ with gr.Row():
278
+ with gr.Column(scale=2):
279
+ gr.Markdown("### ์–ธ์–ด ์„ค์ •")
280
+ with gr.Row():
281
+ source_lang = gr.Dropdown(
282
+ choices=[("ํ•œ๊ตญ์–ด", "ko"), ("์˜์–ด", "en"), ("์ผ๋ณธ์–ด", "ja"),
283
+ ("์ค‘๊ตญ์–ด", "zh"), ("์ŠคํŽ˜์ธ์–ด", "es"), ("ํ”„๋ž‘์Šค์–ด", "fr")],
284
+ value="ko",
285
+ label="์ž…๋ ฅ ์–ธ์–ด"
286
+ )
287
+ swap_btn = gr.Button("โ†”๏ธ", scale=0)
288
+ target_lang = gr.Dropdown(
289
+ choices=[("ํ•œ๊ตญ์–ด", "ko"), ("์˜์–ด", "en"), ("์ผ๋ณธ์–ด", "ja"),
290
+ ("์ค‘๊ตญ์–ด", "zh"), ("์ŠคํŽ˜์ธ์–ด", "es"), ("ํ”„๋ž‘์Šค์–ด", "fr")],
291
+ value="en",
292
+ label="์ถœ๋ ฅ ์–ธ์–ด"
293
+ )
294
+
295
+ with gr.Row():
296
+ with gr.Column():
297
+ gr.Markdown("### ์Œ์„ฑ ์ž…๋ ฅ")
298
+ audio_input = gr.Audio(
299
+ source="microphone",
300
+ type="filepath",
301
+ label="๋…น์Œํ•˜๊ธฐ"
302
+ )
303
+ translate_btn = gr.Button("๋ฒˆ์—ญํ•˜๊ธฐ", variant="primary")
304
+
305
+ with gr.Row():
306
+ with gr.Column():
307
+ gr.Markdown("### ๊ฒฐ๊ณผ")
308
+ transcript_output = gr.Textbox(
309
+ label="์ „์‚ฌ๋œ ํ…์ŠคํŠธ",
310
+ placeholder="์Œ์„ฑ ์ „์‚ฌ ๊ฒฐ๊ณผ๊ฐ€ ์—ฌ๊ธฐ์— ํ‘œ์‹œ๋ฉ๋‹ˆ๋‹ค...",
311
+ lines=3
312
+ )
313
+ translation_output = gr.Textbox(
314
+ label="๋ฒˆ์—ญ๋œ ํ…์ŠคํŠธ",
315
+ placeholder="๋ฒˆ์—ญ ๊ฒฐ๊ณผ๊ฐ€ ์—ฌ๊ธฐ์— ํ‘œ์‹œ๋ฉ๋‹ˆ๋‹ค...",
316
+ lines=3
317
+ )
318
+ audio_output = gr.Audio(
319
+ label="๋ฒˆ์—ญ๋œ ์Œ์„ฑ",
320
+ type="filepath"
321
+ )
322
+
323
+ # ์ด๋ฒคํŠธ ํ•ธ๋“ค๋Ÿฌ
324
+ connect_btn.click(
325
+ fn=connect,
326
+ outputs=[status_text, connection_status]
327
+ )
328
+
329
+ disconnect_btn.click(
330
+ fn=disconnect,
331
+ outputs=[status_text, connection_status]
332
+ )
333
+
334
+ swap_btn.click(
335
+ fn=swap_languages,
336
+ inputs=[source_lang, target_lang],
337
+ outputs=[source_lang, target_lang]
338
+ )
339
+
340
+ translate_btn.click(
341
+ fn=translate_audio,
342
+ inputs=[audio_input, source_lang, target_lang],
343
+ outputs=[transcript_output, translation_output, audio_output]
344
+ )
345
+
346
+ gr.Markdown("""
347
+ ### ๐Ÿ“ ์‚ฌ์šฉ ๋ฐฉ๋ฒ•
348
+ 1. **์—ฐ๊ฒฐ** ๋ฒ„ํŠผ์„ ํด๋ฆญํ•˜์—ฌ OpenAI Realtime API์— ์—ฐ๊ฒฐํ•ฉ๋‹ˆ๋‹ค.
349
+ 2. ์ž…๋ ฅ ์–ธ์–ด์™€ ์ถœ๋ ฅ ์–ธ์–ด๋ฅผ ์„ ํƒํ•ฉ๋‹ˆ๋‹ค.
350
+ 3. ๋งˆ์ดํฌ ๋ฒ„ํŠผ์„ ํด๋ฆญํ•˜์—ฌ ์Œ์„ฑ์„ ๋…น์Œํ•ฉ๋‹ˆ๋‹ค.
351
+ 4. **๋ฒˆ์—ญํ•˜๊ธฐ** ๋ฒ„ํŠผ์„ ํด๋ฆญํ•˜๋ฉด ์ „์‚ฌ ๋ฐ ๋ฒˆ์—ญ์ด ์ง„ํ–‰๋ฉ๋‹ˆ๋‹ค.
352
+
353
+ ### โš ๏ธ ์ฃผ์˜์‚ฌํ•ญ
354
+ - ํ™˜๊ฒฝ ๋ณ€์ˆ˜ `OPENAI_API_KEY`๊ฐ€ ์„ค์ •๋˜์–ด ์žˆ์–ด์•ผ ํ•ฉ๋‹ˆ๋‹ค.
355
+ - ๊ธด ์˜ค๋””์˜ค์˜ ๊ฒฝ์šฐ ์ฒ˜๋ฆฌ ์‹œ๊ฐ„์ด ์˜ค๋ž˜ ๊ฑธ๋ฆด ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.
356
+ """)
357
+
358
+ return demo
359
+
360
# Entry point
if __name__ == "__main__":
    # First-run hints for the user.
    # BUGFIX: `wave` removed from the pip list — it ships with the Python
    # standard library, and `pip install wave` installs an unrelated package.
    print("""
    필요한 패키지:
    pip install gradio websocket-client pyaudio numpy
    
    환경 변수 설정:
    export OPENAI_API_KEY="your-api-key-here"
    """)

    demo = create_interface()
    demo.launch(share=True)