# voice-trans / app.py
import base64
import io
import json
import os
import queue
import threading
import time
import wave
from datetime import datetime

import gradio as gr
import numpy as np
import pyaudio
import websocket
class RealtimeTranslator:
    """Speech translator built on the OpenAI Realtime API.

    Audio is streamed to the API over a WebSocket as base64-encoded PCM16.
    The WebSocket callbacks run on a background thread and publish results
    into thread-safe queues, which the synchronous UI handlers drain.
    """

    def __init__(self):
        self.ws = None  # websocket.WebSocketApp, set by connect_websocket()
        self.api_key = os.getenv("OPENAI_API_KEY")
        # Cross-thread hand-off: the WebSocket thread produces, UI consumes.
        self.audio_queue = queue.Queue()        # raw PCM16 reply audio chunks
        self.transcript_queue = queue.Queue()   # completed transcriptions
        self.translation_queue = queue.Queue()  # streaming translation deltas
        self.is_connected = False
        self.is_recording = False
        self.source_lang = "ko"
        self.target_lang = "en"
        # PyAudio settings. NOTE(review): playback of the translated audio is
        # not implemented yet, so this instance is currently unused at runtime.
        self.p = pyaudio.PyAudio()
        self.sample_rate = 24000  # Realtime API expects 24 kHz PCM16
        self.chunk_size = 1024
        self.audio_format = pyaudio.paInt16

    def connect_websocket(self):
        """Open the Realtime API WebSocket on a daemon thread.

        Returns:
            A short status string. The connection itself is asynchronous, so
            a success return only means the reader thread started cleanly;
            `is_connected` flips True in `on_open`.
        """
        if not self.api_key:
            # Fail fast instead of sending "Authorization: Bearer None".
            return "연결 실패: OPENAI_API_KEY가 설정되지 않았습니다."
        try:
            url = "wss://api.openai.com/v1/realtime?model=gpt-4o-realtime-preview-2024-12-17"
            headers = {
                "Authorization": f"Bearer {self.api_key}",
                "OpenAI-Beta": "realtime=v1"
            }
            self.ws = websocket.WebSocketApp(
                url,
                header=headers,
                on_open=self.on_open,
                on_message=self.on_message,
                on_error=self.on_error,
                on_close=self.on_close
            )
            # Run the socket loop off the main thread so Gradio stays responsive.
            wst = threading.Thread(target=self.ws.run_forever)
            wst.daemon = True
            wst.start()
            return "연결 성공"
        except Exception as e:
            return f"연결 실패: {str(e)}"

    def on_open(self, ws):
        """WebSocket opened: mark connected and configure the session."""
        self.is_connected = True
        print("WebSocket 연결됨")
        session_update = {
            "type": "session.update",
            "session": {
                "modalities": ["text", "audio"],
                "instructions": f"You are a helpful translator. Translate between {self.get_language_name(self.source_lang)} and {self.get_language_name(self.target_lang)}. Respond with both the transcription and translation.",
                "voice": "alloy",
                "input_audio_format": "pcm16",
                "output_audio_format": "pcm16",
                "input_audio_transcription": {
                    "model": "whisper-1"
                },
                # Server-side voice activity detection closes a turn after
                # 500 ms of silence.
                "turn_detection": {
                    "type": "server_vad",
                    "threshold": 0.5,
                    "prefix_padding_ms": 300,
                    "silence_duration_ms": 500
                }
            }
        }
        ws.send(json.dumps(session_update))

    def on_message(self, ws, message):
        """Dispatch one Realtime API event (runs on the WebSocket thread)."""
        try:
            event = json.loads(message)
            event_type = event.get("type")
            if event_type == "conversation.item.input_audio_transcription.completed":
                # Whisper transcription of the user's audio is final.
                transcript = event.get("transcript", "")
                self.transcript_queue.put(transcript)
                # Ask the model to translate the finished transcript.
                self.request_translation(transcript)
            elif event_type == "response.text.delta":
                # Streaming chunk of the translation text.
                # NOTE(review): with audio modality enabled the model may
                # stream "response.audio_transcript.delta" instead — confirm
                # against actual event traffic.
                delta = event.get("delta", "")
                self.translation_queue.put(delta)
            elif event_type == "response.audio.delta":
                # Streaming chunk of the spoken translation (base64 PCM16).
                audio_data = base64.b64decode(event.get("delta", ""))
                self.audio_queue.put(audio_data)
            elif event_type == "error":
                error_msg = event.get("error", {}).get("message", "Unknown error")
                print(f"Error: {error_msg}")
        except Exception as e:
            # Never let one malformed event kill the reader thread.
            print(f"메시지 처리 오류: {str(e)}")

    def on_error(self, ws, error):
        """WebSocket error callback: log and mark disconnected."""
        print(f"WebSocket 오류: {error}")
        self.is_connected = False

    def on_close(self, ws, close_status_code, close_msg):
        """WebSocket close callback: mark disconnected."""
        print("WebSocket 연결 종료")
        self.is_connected = False

    def get_language_name(self, lang_code):
        """Map an ISO 639-1 code to its English name (pass through unknowns)."""
        languages = {
            "ko": "Korean",
            "en": "English",
            "ja": "Japanese",
            "zh": "Chinese",
            "es": "Spanish",
            "fr": "French"
        }
        return languages.get(lang_code, lang_code)

    def request_translation(self, text):
        """Send `text` to the model as a translation request.

        No-op when the socket is not connected.
        """
        if not self.ws or not self.is_connected:
            return
        message = {
            "type": "conversation.item.create",
            "item": {
                "type": "message",
                "role": "user",
                "content": [{
                    "type": "input_text",
                    "text": f"Translate this {self.get_language_name(self.source_lang)} text to {self.get_language_name(self.target_lang)}: '{text}'"
                }]
            }
        }
        self.ws.send(json.dumps(message))
        # The item alone does nothing; explicitly request a model response.
        response_create = {"type": "response.create"}
        self.ws.send(json.dumps(response_create))

    def send_audio_chunk(self, audio_data):
        """Append one PCM16 chunk to the server-side input audio buffer.

        No-op when the socket is not connected.
        """
        if not self.ws or not self.is_connected:
            return
        audio_base64 = base64.b64encode(audio_data).decode('utf-8')
        message = {
            "type": "input_audio_buffer.append",
            "audio": audio_base64
        }
        self.ws.send(json.dumps(message))

    def _drain_queues(self):
        """Discard any results left over from a previous request so stale
        transcripts/translations cannot bleed into the next one."""
        for q in (self.transcript_queue, self.translation_queue, self.audio_queue):
            while True:
                try:
                    q.get_nowait()
                except queue.Empty:
                    break

    def process_audio(self, audio_file):
        """Stream a recorded WAV file to the API and wait for results.

        Args:
            audio_file: path to a WAV file.
                NOTE(review): the API expects 24 kHz mono PCM16; no
                resampling is performed here — confirm the recording format.

        Returns:
            (transcript, translation) tuple; either may be "" on timeout,
            or an error message in the first slot on failure.
        """
        if not self.is_connected:
            return "WebSocket이 연결되지 않았습니다.", ""
        try:
            self._drain_queues()
            with wave.open(audio_file, 'rb') as wf:
                audio_data = wf.readframes(wf.getnframes())
            # Send the audio in chunks, then commit the input buffer.
            chunk_size = 4096
            for i in range(0, len(audio_data), chunk_size):
                self.send_audio_chunk(audio_data[i:i + chunk_size])
            commit_message = {"type": "input_audio_buffer.commit"}
            self.ws.send(json.dumps(commit_message))
            # Wait (up to 10 s) for the transcription and translation to
            # arrive from the WebSocket thread.
            transcript = ""
            translation = ""
            timeout = 10
            start_time = time.time()
            while time.time() - start_time < timeout:
                if not transcript:
                    # The 0.1 s timeout also paces the loop; never overwrite
                    # a transcript we already have.
                    try:
                        transcript = self.transcript_queue.get(timeout=0.1)
                    except queue.Empty:
                        pass
                # Accumulate every translation delta queued so far.
                while True:
                    try:
                        translation += self.translation_queue.get_nowait()
                    except queue.Empty:
                        break
                if transcript and translation:
                    break
                if transcript:
                    # Transcript done, still streaming translation: avoid a
                    # hot spin while waiting for more deltas.
                    time.sleep(0.05)
            return transcript, translation
        except Exception as e:
            return f"오류: {str(e)}", ""

    def disconnect(self):
        """Close the WebSocket (if any) and mark the translator disconnected."""
        if self.ws:
            self.ws.close()
        self.is_connected = False
        return "연결 종료됨"
# Build the Gradio interface
def create_interface():
    """Assemble and return the Gradio Blocks UI for the realtime translator."""
    translator = RealtimeTranslator()

    def connect():
        """Connect to the Realtime API and report the resulting state."""
        if not translator.api_key:
            return "API 키가 설정되지 않았습니다. 환경 변수 OPENAI_API_KEY를 설정하세요.", gr.update(value=False)
        result = translator.connect_websocket()
        # The handshake completes on a background thread; wait briefly so the
        # connection checkbox reflects the real state instead of racing it.
        deadline = time.time() + 5
        while not translator.is_connected and time.time() < deadline:
            time.sleep(0.1)
        return result, gr.update(value=translator.is_connected)

    def disconnect():
        """Close the connection and clear the connected checkbox."""
        result = translator.disconnect()
        return result, gr.update(value=False)

    def translate_audio(audio_file, source_lang, target_lang):
        """Transcribe and translate one recorded clip.

        Returns (transcript, translation, audio) — the audio slot is always
        None because playback of the API's audio reply is not wired up yet.
        """
        if not audio_file:
            return "오디오 파일을 선택하세요.", "", None
        translator.source_lang = source_lang
        translator.target_lang = target_lang
        transcript, translation = translator.process_audio(audio_file)
        return transcript, translation, None

    def swap_languages(source, target):
        """Swap the input and output language selections."""
        return target, source

    with gr.Blocks(title="실시간 음성 번역기") as demo:
        gr.Markdown("# 🎙️ OpenAI Realtime API 음성 번역기")
        gr.Markdown("실시간으로 음성을 전사하고 번역합니다.")

        # Connection controls.
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### 연결 상태")
                connection_status = gr.Checkbox(label="연결됨", value=False, interactive=False)
                connect_btn = gr.Button("연결", variant="primary")
                disconnect_btn = gr.Button("연결 종료", variant="secondary")
                status_text = gr.Textbox(label="상태 메시지", value="연결되지 않음")

        # Language selection with a swap button between the two dropdowns.
        with gr.Row():
            with gr.Column(scale=2):
                gr.Markdown("### 언어 설정")
                with gr.Row():
                    source_lang = gr.Dropdown(
                        choices=[("한국어", "ko"), ("영어", "en"), ("일본어", "ja"),
                                 ("중국어", "zh"), ("스페인어", "es"), ("프랑스어", "fr")],
                        value="ko",
                        label="입력 언어"
                    )
                    swap_btn = gr.Button("↔️", scale=0)
                    target_lang = gr.Dropdown(
                        choices=[("한국어", "ko"), ("영어", "en"), ("일본어", "ja"),
                                 ("중국어", "zh"), ("스페인어", "es"), ("프랑스어", "fr")],
                        value="en",
                        label="출력 언어"
                    )

        # Microphone input.
        with gr.Row():
            with gr.Column():
                gr.Markdown("### 음성 입력")
                audio_input = gr.Audio(
                    source="microphone",
                    type="filepath",
                    label="녹음하기"
                )
                translate_btn = gr.Button("번역하기", variant="primary")

        # Results: transcript, translation, and (unused) audio reply.
        with gr.Row():
            with gr.Column():
                gr.Markdown("### 결과")
                transcript_output = gr.Textbox(
                    label="전사된 텍스트",
                    placeholder="음성 전사 결과가 여기에 표시됩니다...",
                    lines=3
                )
                translation_output = gr.Textbox(
                    label="번역된 텍스트",
                    placeholder="번역 결과가 여기에 표시됩니다...",
                    lines=3
                )
                audio_output = gr.Audio(
                    label="번역된 음성",
                    type="filepath"
                )

        # Event wiring.
        connect_btn.click(
            fn=connect,
            outputs=[status_text, connection_status]
        )
        disconnect_btn.click(
            fn=disconnect,
            outputs=[status_text, connection_status]
        )
        swap_btn.click(
            fn=swap_languages,
            inputs=[source_lang, target_lang],
            outputs=[source_lang, target_lang]
        )
        translate_btn.click(
            fn=translate_audio,
            inputs=[audio_input, source_lang, target_lang],
            outputs=[transcript_output, translation_output, audio_output]
        )

        gr.Markdown("""
        ### 📝 사용 방법
        1. **연결** 버튼을 클릭하여 OpenAI Realtime API에 연결합니다.
        2. 입력 언어와 출력 언어를 선택합니다.
        3. 마이크 버튼을 클릭하여 음성을 녹음합니다.
        4. **번역하기** 버튼을 클릭하면 전사 및 번역이 진행됩니다.
        ### ⚠️ 주의사항
        - 환경 변수 `OPENAI_API_KEY`가 설정되어 있어야 합니다.
        - 긴 오디오의 경우 처리 시간이 오래 걸릴 수 있습니다.
        """)

    return demo
# ์‹คํ–‰
if __name__ == "__main__":
# ํ•„์š”ํ•œ ํŒจํ‚ค์ง€ ์„ค์น˜ ์•ˆ๋‚ด
print("""
ํ•„์š”ํ•œ ํŒจํ‚ค์ง€:
pip install gradio websocket-client pyaudio wave numpy
ํ™˜๊ฒฝ ๋ณ€์ˆ˜ ์„ค์ •:
export OPENAI_API_KEY="your-api-key-here"
""")
demo = create_interface()
demo.launch(share=True)