# HuggingFace Spaces banner (page-scrape residue): "Spaces: Running on Zero"
import base64
import io
import json
import os
import queue
import threading
import time
import wave
from datetime import datetime

import gradio as gr
import numpy as np
import pyaudio
import websocket
class RealtimeTranslator:
    """Real-time speech translator built on the OpenAI Realtime API.

    Audio is streamed as base64-encoded PCM16 over a WebSocket; the API
    returns Whisper transcriptions and streaming translation deltas.
    Events arrive on the WebSocket thread and are handed to the caller's
    thread through thread-safe queues.
    """

    def __init__(self):
        self.ws = None
        self.api_key = os.getenv("OPENAI_API_KEY")
        # Queues bridge the WebSocket callback thread and the UI thread.
        self.audio_queue = queue.Queue()
        self.transcript_queue = queue.Queue()
        self.translation_queue = queue.Queue()
        self.is_connected = False
        self.is_recording = False
        self.source_lang = "ko"
        self.target_lang = "en"
        # PyAudio setup — the Realtime API expects 24 kHz PCM16 audio.
        self.p = pyaudio.PyAudio()
        self.sample_rate = 24000
        self.chunk_size = 1024
        self.audio_format = pyaudio.paInt16

    def connect_websocket(self):
        """Open the Realtime WebSocket and pump it on a daemon thread.

        Returns a human-readable status string. Success only means the
        connection attempt was started; the handshake completes
        asynchronously and flips is_connected in on_open.
        """
        try:
            url = "wss://api.openai.com/v1/realtime?model=gpt-4o-realtime-preview-2024-12-17"
            headers = {
                "Authorization": f"Bearer {self.api_key}",
                "OpenAI-Beta": "realtime=v1"
            }
            self.ws = websocket.WebSocketApp(
                url,
                header=headers,
                on_open=self.on_open,
                on_message=self.on_message,
                on_error=self.on_error,
                on_close=self.on_close
            )
            # run_forever() blocks, so it gets its own daemon thread.
            wst = threading.Thread(target=self.ws.run_forever)
            wst.daemon = True
            wst.start()
            return "์—ฐ๊ฒฐ ์„ฑ๊ณต"
        except Exception as e:
            return f"์—ฐ๊ฒฐ ์‹คํŒจ: {str(e)}"

    def on_open(self, ws):
        """Called once the WebSocket is up; configures the session."""
        self.is_connected = True
        print("WebSocket ์—ฐ๊ฒฐ๋จ")
        # Session configuration: PCM16 in/out, Whisper transcription,
        # and server-side voice-activity detection for turn taking.
        session_update = {
            "type": "session.update",
            "session": {
                "modalities": ["text", "audio"],
                "instructions": f"You are a helpful translator. Translate between {self.get_language_name(self.source_lang)} and {self.get_language_name(self.target_lang)}. Respond with both the transcription and translation.",
                "voice": "alloy",
                "input_audio_format": "pcm16",
                "output_audio_format": "pcm16",
                "input_audio_transcription": {
                    "model": "whisper-1"
                },
                "turn_detection": {
                    "type": "server_vad",
                    "threshold": 0.5,
                    "prefix_padding_ms": 300,
                    "silence_duration_ms": 500
                }
            }
        }
        ws.send(json.dumps(session_update))

    def on_message(self, ws, message):
        """Dispatch incoming Realtime API events to the result queues."""
        try:
            event = json.loads(message)
            event_type = event.get("type")
            if event_type == "conversation.item.input_audio_transcription.completed":
                # Speech-to-text finished: hand the transcript to the UI
                # and immediately request its translation.
                transcript = event.get("transcript", "")
                self.transcript_queue.put(transcript)
                self.request_translation(transcript)
            elif event_type == "response.text.delta":
                # Streaming translation text fragment.
                delta = event.get("delta", "")
                self.translation_queue.put(delta)
            elif event_type == "response.audio.delta":
                # Streaming synthesized-audio fragment (base64 PCM16).
                audio_data = base64.b64decode(event.get("delta", ""))
                self.audio_queue.put(audio_data)
            elif event_type == "error":
                error_msg = event.get("error", {}).get("message", "Unknown error")
                print(f"Error: {error_msg}")
        except Exception as e:
            print(f"๋ฉ”์‹œ์ง€ ์ฒ˜๋ฆฌ ์˜ค๋ฅ˜: {str(e)}")

    def on_error(self, ws, error):
        """WebSocket error callback; marks the connection as down."""
        print(f"WebSocket ์˜ค๋ฅ˜: {error}")
        self.is_connected = False

    def on_close(self, ws, close_status_code, close_msg):
        """WebSocket close callback; marks the connection as down."""
        print("WebSocket ์—ฐ๊ฒฐ ์ข…๋ฃŒ")
        self.is_connected = False

    def get_language_name(self, lang_code):
        """Map an ISO language code to its English name.

        Unknown codes are returned unchanged so prompts stay readable.
        """
        languages = {
            "ko": "Korean",
            "en": "English",
            "ja": "Japanese",
            "zh": "Chinese",
            "es": "Spanish",
            "fr": "French"
        }
        return languages.get(lang_code, lang_code)

    def request_translation(self, text):
        """Ask the model to translate *text*; results stream via on_message."""
        if not self.ws or not self.is_connected:
            return
        message = {
            "type": "conversation.item.create",
            "item": {
                "type": "message",
                "role": "user",
                "content": [{
                    "type": "input_text",
                    "text": f"Translate this {self.get_language_name(self.source_lang)} text to {self.get_language_name(self.target_lang)}: '{text}'"
                }]
            }
        }
        self.ws.send(json.dumps(message))
        # A response.create event is required to trigger generation.
        response_create = {"type": "response.create"}
        self.ws.send(json.dumps(response_create))

    def send_audio_chunk(self, audio_data):
        """Append one chunk of raw PCM16 bytes to the input audio buffer."""
        if not self.ws or not self.is_connected:
            return
        # The Realtime API requires audio as base64 text.
        audio_base64 = base64.b64encode(audio_data).decode('utf-8')
        message = {
            "type": "input_audio_buffer.append",
            "audio": audio_base64
        }
        self.ws.send(json.dumps(message))

    def process_audio(self, audio_file):
        """Stream a recorded WAV file and wait for transcript + translation.

        Returns a (transcript, translation) tuple; either element may be
        empty if the 10-second timeout expires first.
        """
        if not self.is_connected:
            return "WebSocket์ด ์—ฐ๊ฒฐ๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค.", ""
        try:
            # Drain stale results left over from a previous request so
            # they cannot leak into this request's output.
            for pending in (self.transcript_queue, self.translation_queue):
                while True:
                    try:
                        pending.get_nowait()
                    except queue.Empty:
                        break
            # NOTE(review): assumes the WAV is already 24 kHz mono PCM16;
            # no resampling is performed here — confirm upstream format.
            with wave.open(audio_file, 'rb') as wf:
                audio_data = wf.readframes(wf.getnframes())
            # Send the audio in manageable chunks.
            chunk_size = 4096
            for i in range(0, len(audio_data), chunk_size):
                self.send_audio_chunk(audio_data[i:i + chunk_size])
            # Commit the buffer so the server finalizes the input turn.
            commit_message = {"type": "input_audio_buffer.commit"}
            self.ws.send(json.dumps(commit_message))
            # Poll both queues until we have results or hit the timeout.
            transcript = ""
            translation = ""
            timeout = 10  # seconds
            start_time = time.time()
            while time.time() - start_time < timeout:
                try:
                    transcript = self.transcript_queue.get(timeout=0.1)
                except queue.Empty:
                    pass
                # Accumulate every translation delta received so far;
                # get_nowait avoids the empty()/get() race on the
                # WebSocket thread.
                while True:
                    try:
                        translation += self.translation_queue.get_nowait()
                    except queue.Empty:
                        break
                if transcript and translation:
                    break
            return transcript, translation
        except Exception as e:
            return f"์˜ค๋ฅ˜: {str(e)}", ""

    def disconnect(self):
        """Close the WebSocket (if any) and mark the translator offline."""
        if self.ws:
            self.ws.close()
        self.is_connected = False
        return "์—ฐ๊ฒฐ ์ข…๋ฃŒ๋จ"
# Gradio ์ธํฐํ์ด์ค ์์ฑ | |
def create_interface(): | |
translator = RealtimeTranslator() | |
def connect(): | |
if not translator.api_key: | |
return "API ํค๊ฐ ์ค์ ๋์ง ์์์ต๋๋ค. ํ๊ฒฝ ๋ณ์ OPENAI_API_KEY๋ฅผ ์ค์ ํ์ธ์.", gr.update(value=False) | |
result = translator.connect_websocket() | |
return result, gr.update(value=translator.is_connected) | |
def disconnect(): | |
result = translator.disconnect() | |
return result, gr.update(value=False) | |
def translate_audio(audio_file, source_lang, target_lang): | |
if not audio_file: | |
return "์ค๋์ค ํ์ผ์ ์ ํํ์ธ์.", "", None | |
translator.source_lang = source_lang | |
translator.target_lang = target_lang | |
transcript, translation = translator.process_audio(audio_file) | |
# ์ค๋์ค ์๋ต ์ฒ๋ฆฌ (ํ์ฌ๋ ํ ์คํธ๋ง ๋ฐํ) | |
return transcript, translation, None | |
def swap_languages(source, target): | |
return target, source | |
with gr.Blocks(title="์ค์๊ฐ ์์ฑ ๋ฒ์ญ๊ธฐ") as demo: | |
gr.Markdown("# ๐๏ธ OpenAI Realtime API ์์ฑ ๋ฒ์ญ๊ธฐ") | |
gr.Markdown("์ค์๊ฐ์ผ๋ก ์์ฑ์ ์ ์ฌํ๊ณ ๋ฒ์ญํฉ๋๋ค.") | |
with gr.Row(): | |
with gr.Column(scale=1): | |
gr.Markdown("### ์ฐ๊ฒฐ ์ํ") | |
connection_status = gr.Checkbox(label="์ฐ๊ฒฐ๋จ", value=False, interactive=False) | |
connect_btn = gr.Button("์ฐ๊ฒฐ", variant="primary") | |
disconnect_btn = gr.Button("์ฐ๊ฒฐ ์ข ๋ฃ", variant="secondary") | |
status_text = gr.Textbox(label="์ํ ๋ฉ์์ง", value="์ฐ๊ฒฐ๋์ง ์์") | |
with gr.Row(): | |
with gr.Column(scale=2): | |
gr.Markdown("### ์ธ์ด ์ค์ ") | |
with gr.Row(): | |
source_lang = gr.Dropdown( | |
choices=[("ํ๊ตญ์ด", "ko"), ("์์ด", "en"), ("์ผ๋ณธ์ด", "ja"), | |
("์ค๊ตญ์ด", "zh"), ("์คํ์ธ์ด", "es"), ("ํ๋์ค์ด", "fr")], | |
value="ko", | |
label="์ ๋ ฅ ์ธ์ด" | |
) | |
swap_btn = gr.Button("โ๏ธ", scale=0) | |
target_lang = gr.Dropdown( | |
choices=[("ํ๊ตญ์ด", "ko"), ("์์ด", "en"), ("์ผ๋ณธ์ด", "ja"), | |
("์ค๊ตญ์ด", "zh"), ("์คํ์ธ์ด", "es"), ("ํ๋์ค์ด", "fr")], | |
value="en", | |
label="์ถ๋ ฅ ์ธ์ด" | |
) | |
with gr.Row(): | |
with gr.Column(): | |
gr.Markdown("### ์์ฑ ์ ๋ ฅ") | |
audio_input = gr.Audio( | |
source="microphone", | |
type="filepath", | |
label="๋ น์ํ๊ธฐ" | |
) | |
translate_btn = gr.Button("๋ฒ์ญํ๊ธฐ", variant="primary") | |
with gr.Row(): | |
with gr.Column(): | |
gr.Markdown("### ๊ฒฐ๊ณผ") | |
transcript_output = gr.Textbox( | |
label="์ ์ฌ๋ ํ ์คํธ", | |
placeholder="์์ฑ ์ ์ฌ ๊ฒฐ๊ณผ๊ฐ ์ฌ๊ธฐ์ ํ์๋ฉ๋๋ค...", | |
lines=3 | |
) | |
translation_output = gr.Textbox( | |
label="๋ฒ์ญ๋ ํ ์คํธ", | |
placeholder="๋ฒ์ญ ๊ฒฐ๊ณผ๊ฐ ์ฌ๊ธฐ์ ํ์๋ฉ๋๋ค...", | |
lines=3 | |
) | |
audio_output = gr.Audio( | |
label="๋ฒ์ญ๋ ์์ฑ", | |
type="filepath" | |
) | |
# ์ด๋ฒคํธ ํธ๋ค๋ฌ | |
connect_btn.click( | |
fn=connect, | |
outputs=[status_text, connection_status] | |
) | |
disconnect_btn.click( | |
fn=disconnect, | |
outputs=[status_text, connection_status] | |
) | |
swap_btn.click( | |
fn=swap_languages, | |
inputs=[source_lang, target_lang], | |
outputs=[source_lang, target_lang] | |
) | |
translate_btn.click( | |
fn=translate_audio, | |
inputs=[audio_input, source_lang, target_lang], | |
outputs=[transcript_output, translation_output, audio_output] | |
) | |
gr.Markdown(""" | |
### ๐ ์ฌ์ฉ ๋ฐฉ๋ฒ | |
1. **์ฐ๊ฒฐ** ๋ฒํผ์ ํด๋ฆญํ์ฌ OpenAI Realtime API์ ์ฐ๊ฒฐํฉ๋๋ค. | |
2. ์ ๋ ฅ ์ธ์ด์ ์ถ๋ ฅ ์ธ์ด๋ฅผ ์ ํํฉ๋๋ค. | |
3. ๋ง์ดํฌ ๋ฒํผ์ ํด๋ฆญํ์ฌ ์์ฑ์ ๋ น์ํฉ๋๋ค. | |
4. **๋ฒ์ญํ๊ธฐ** ๋ฒํผ์ ํด๋ฆญํ๋ฉด ์ ์ฌ ๋ฐ ๋ฒ์ญ์ด ์งํ๋ฉ๋๋ค. | |
### โ ๏ธ ์ฃผ์์ฌํญ | |
- ํ๊ฒฝ ๋ณ์ `OPENAI_API_KEY`๊ฐ ์ค์ ๋์ด ์์ด์ผ ํฉ๋๋ค. | |
- ๊ธด ์ค๋์ค์ ๊ฒฝ์ฐ ์ฒ๋ฆฌ ์๊ฐ์ด ์ค๋ ๊ฑธ๋ฆด ์ ์์ต๋๋ค. | |
""") | |
return demo | |
# ์คํ | |
if __name__ == "__main__": | |
# ํ์ํ ํจํค์ง ์ค์น ์๋ด | |
print(""" | |
ํ์ํ ํจํค์ง: | |
pip install gradio websocket-client pyaudio wave numpy | |
ํ๊ฒฝ ๋ณ์ ์ค์ : | |
export OPENAI_API_KEY="your-api-key-here" | |
""") | |
demo = create_interface() | |
demo.launch(share=True) |