# Hugging Face Space — "Running on Zero" (ZeroGPU) page-scrape residue.
import asyncio
import base64
import io
import json
import os
import queue
import threading
import time
import wave
from typing import Generator, Tuple

import edge_tts
import gradio as gr
import numpy as np
import openai
import pyaudio
import websocket
from openai import OpenAI
# OpenAI API key configuration — read from the environment at import time,
# so OPENAI_API_KEY is None (not a placeholder string) when unset.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=OPENAI_API_KEY)
class RealtimeTranslator:
    """Coordinates streaming speech-to-text and Korean↔English translation."""

    def __init__(self):
        self.is_recording = False
        self.audio_queue = queue.Queue()        # raw audio chunks
        self.text_queue = queue.Queue()         # transcribed text
        self.translation_queue = queue.Queue()  # translated text
        self.current_text = ""
        self.detected_language = None

    def detect_language(self, text: str) -> str:
        """Return "ko" when over 30% of non-space characters are Hangul, else "en"."""
        # Hangul Syllables block: U+AC00 ('가') .. U+D7A3 ('힣').
        korean_chars = sum(1 for char in text if 0xAC00 <= ord(char) <= 0xD7A3)
        total_chars = len(text.replace(" ", ""))
        if total_chars > 0 and korean_chars / total_chars > 0.3:
            return "ko"
        return "en"

    def process_audio_chunk(self, audio_chunk, sample_rate: int = 16000) -> str:
        """Transcribe a chunk of 16-bit mono PCM bytes with the Whisper API.

        ``sample_rate`` generalizes the previously hard-coded 16 kHz WAV header
        (default keeps the original behavior). Returns the transcript text, or
        "" on any failure — a bad chunk must not kill the stream.
        """
        import os
        import tempfile
        import wave

        tmp_path = None
        try:
            # Write the chunk into a temporary WAV container so the API
            # receives a seekable file with a proper header.
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
                tmp_path = tmp_file.name
            with wave.open(tmp_path, "wb") as wav_file:
                wav_file.setnchannels(1)   # mono
                wav_file.setsampwidth(2)   # 16-bit samples
                wav_file.setframerate(sample_rate)
                wav_file.writeframes(audio_chunk)

            with open(tmp_path, "rb") as audio_file:
                transcript = client.audio.transcriptions.create(
                    model="whisper-1",
                    file=audio_file,
                    language=None,  # let Whisper auto-detect ko/en
                    prompt="실시간 대화를 번역합니다.",
                )
            return transcript.text
        except Exception as e:
            print(f"음성 인식 오류: {e}")
            return ""
        finally:
            # BUG FIX: the temp file was created with delete=False and was
            # never removed — clean it up on every path.
            if tmp_path is not None:
                try:
                    os.remove(tmp_path)
                except OSError:
                    pass

    def translate_stream(self, text: str, source_lang: str) -> str:
        """Translate *text* (ko→en or en→ko) via a streaming chat completion.

        Returns the accumulated translation, or "" for empty input / on failure.
        """
        try:
            if not text or not text.strip():
                return ""
            if source_lang == "ko":
                system_prompt = "실시간 통역사입니다. 한국어를 영어로 즉시 번역합니다."
            else:
                system_prompt = "실시간 통역사입니다. 영어를 한국어로 즉시 번역합니다."
            messages = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": text},
            ]
            stream = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=messages,
                stream=True,
                temperature=0.3,  # low temperature keeps translations literal
                max_tokens=150,
            )
            # Accumulate streamed deltas into the final translation.
            return "".join(
                chunk.choices[0].delta.content
                for chunk in stream
                if chunk.choices[0].delta.content
            )
        except Exception as e:
            print(f"번역 오류: {e}")
            return ""
# Module-level singleton shared by the Gradio streaming callback.
translator = RealtimeTranslator()
def process_stream(audio_stream):
    """Yield (status, korean_text, english_text) updates for one mic chunk.

    *audio_stream* is Gradio's streaming payload: ``None`` before recording,
    otherwise a ``(sample_rate, numpy_array)`` tuple of PCM samples.
    Yields nothing when transcription returns empty text (UI keeps last state).
    """
    if audio_stream is None:
        yield "🔴 마이크를 켜고 말씀해주세요", "", ""
        return
    sample_rate, audio_data = audio_stream
    # Skip chunks shorter than half a second — too little context for Whisper.
    if len(audio_data) < sample_rate * 0.5:
        yield "🎤 듣고 있습니다...", "", ""
        return
    audio_bytes = audio_data.tobytes()
    # NOTE(review): process_audio_chunk writes a 16 kHz WAV header regardless
    # of the browser's actual capture rate — TODO confirm and pass sample_rate.
    text = translator.process_audio_chunk(audio_bytes)
    if not text:
        return
    detected_lang = translator.detect_language(text)
    translated = translator.translate_stream(text, detected_lang)
    # Outputs are always ordered (status, Korean box, English box).
    if detected_lang == "ko":
        yield "✅ 한국어 감지", text, translated
    else:
        yield "✅ English detected", translated, text
def create_realtime_interface():
    """Build and return the Gradio Blocks UI for the real-time translator."""
    # BUG FIX: css must be passed to the Blocks constructor; assigning
    # `demo.css = ...` after construction is ignored by current Gradio.
    custom_css = """
    #audio-stream {
        height: 150px !important;
    }
    #korean-text, #english-text {
        font-size: 18px !important;
        line-height: 1.5 !important;
    }
    .gradio-container {
        max-width: 1200px !important;
    }
    """
    with gr.Blocks(title="실시간 음성 번역기", theme=gr.themes.Soft(), css=custom_css) as demo:
        gr.Markdown(
            """
            # 🎤 실시간 음성 번역기 (Real-time Voice Translator)
            ### 말하는 동안 실시간으로 번역됩니다!
            🔴 **시작** 버튼을 누르고 말하면, 실시간으로 번역이 표시됩니다.
            ---
            """
        )
        with gr.Row():
            with gr.Column(scale=2):
                # NOTE(review): Gradio 4.x renamed `source=` to `sources=[...]`;
                # keeping `source` for the 3.x API this file targets — confirm version.
                audio_input = gr.Audio(
                    source="microphone",
                    type="numpy",
                    streaming=True,  # deliver chunks while recording
                    label="🎤 실시간 마이크 입력",
                    elem_id="audio-stream",
                )
            with gr.Column(scale=1):
                status_text = gr.Textbox(
                    label="📊 상태",
                    value="🔴 마이크를 켜고 말씀해주세요",
                    interactive=False,
                )
        with gr.Row():
            with gr.Column():
                korean_output = gr.Textbox(
                    label="🇰🇷 한국어",
                    placeholder="한국어가 실시간으로 표시됩니다",
                    lines=8,
                    interactive=False,
                    elem_id="korean-text",
                )
            with gr.Column():
                english_output = gr.Textbox(
                    label="🇺🇸 English",
                    placeholder="English translation appears here in real-time",
                    lines=8,
                    interactive=False,
                    elem_id="english-text",
                )
        # Wire the microphone stream into the transcription/translation pipeline.
        audio_input.stream(
            fn=process_stream,
            inputs=[audio_input],
            outputs=[status_text, korean_output, english_output],
            show_progress=False,
        )
        gr.Markdown(
            """
            ---
            ### 💡 사용 팁:
            - 명확하게 말할수록 인식률이 높아집니다
            - 문장이 끝날 때까지 잠시 멈추면 더 정확한 번역이 됩니다
            - 한국어와 영어를 자동으로 감지합니다
            ### ⚙️ 기술 사양:
            - **음성 인식**: OpenAI Whisper (실시간 스트리밍)
            - **번역**: GPT-4 (스트리밍 모드)
            - **지연 시간**: ~1-2초
            """
        )
    return demo
# Alternative: WebSocket-based real-time translation (lower latency).
class WebSocketTranslator:
    """Lower-latency pipeline over the OpenAI Realtime WebSocket endpoint.

    NOTE(review): experimental scaffolding — `process_realtime_text` is a stub
    and the endpoint URL is a placeholder.
    """

    def __init__(self):
        self.ws_url = "wss://api.openai.com/v1/realtime"  # placeholder URL
        self.ws = None
        self.is_connected = False

    def connect(self):
        """Open the WebSocket and run its event loop in a daemon thread."""
        headers = {
            "Authorization": f"Bearer {OPENAI_API_KEY}",
            "OpenAI-Beta": "realtime=v1",
        }
        try:
            self.ws = websocket.WebSocketApp(
                self.ws_url,
                header=headers,
                on_open=self.on_open,
                on_message=self.on_message,
                on_error=self.on_error,
                on_close=self.on_close,
            )
            # Run the socket loop off the main thread so the UI stays responsive;
            # daemon thread dies with the process.
            wst = threading.Thread(target=self.ws.run_forever, daemon=True)
            wst.start()
        except Exception as e:
            print(f"WebSocket 연결 오류: {e}")

    def on_open(self, ws):
        # Connection established.
        self.is_connected = True
        print("WebSocket 연결됨")

    def on_message(self, ws, message):
        """Parse an incoming JSON frame; transcription frames feed the translator."""
        try:
            data = json.loads(message)
            if data.get("type") == "transcription":
                self.process_realtime_text(data.get("text", ""))
        except Exception as e:
            print(f"메시지 처리 오류: {e}")

    def on_error(self, ws, error):
        print(f"WebSocket 오류: {error}")

    def on_close(self, ws, close_status_code, close_msg):
        # Mark disconnected so send_audio() becomes a no-op.
        self.is_connected = False
        print("WebSocket 연결 종료")

    def send_audio(self, audio_data):
        """Send raw audio bytes as a base64-encoded JSON frame (no-op if closed)."""
        if self.is_connected and self.ws:
            message = {
                "type": "audio",
                "audio": base64.b64encode(audio_data).decode("utf-8"),
            }
            self.ws.send(json.dumps(message))

    def process_realtime_text(self, text):
        """Detect language and translate *text* — not implemented yet."""
        pass
# Script entry point.
if __name__ == "__main__":
    # BUG FIX: OPENAI_API_KEY is loaded from the environment at import time,
    # so when unset it is None — the old `== "your-api-key-here"` comparison
    # could never be true and the warning never fired.
    if not OPENAI_API_KEY:
        print("⚠️ 경고: OpenAI API 키를 설정해주세요!")
        print("환경 변수 OPENAI_API_KEY를 설정하거나 코드에 직접 입력하세요.")
    # Build and launch the Gradio app.
    demo = create_realtime_interface()
    demo.queue()  # queuing is required for streaming events
    demo.launch(
        share=False,
        server_name="0.0.0.0",  # listen on all interfaces (Spaces requirement)
        server_port=7860,
        debug=True,
    )