voice-trans / app.py
openfree's picture
Update app.py
8000eeb verified
raw
history blame
13.5 kB
import os, asyncio, json, tempfile, websockets, pdfplumber
import gradio as gr
import openai
from dotenv import load_dotenv
import numpy as np
import wave
# ─── 0. Initialization ───────────────────────────────────────────────
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
if not openai.api_key:
    raise RuntimeError("OPENAI_API_KEY ๊ฐ€ .env ์— ์—†์Šต๋‹ˆ๋‹ค!")

# Languages offered in every dropdown of the UI.
LANG = [
    "Korean", "English", "Japanese", "Chinese",
    "Thai", "Russian", "Vietnamese", "Spanish", "French",
]
# TTS voice per language: CJK languages get "nova", all others "alloy".
VOICE = {lang: ("nova" if lang in ("Korean", "Japanese", "Chinese") else "alloy")
         for lang in LANG}
# Fixed target set for the simultaneous four-language tab.
FOUR = ["English", "Chinese", "Thai", "Russian"]
# Realtime API endpoint (defined but not used by the current handlers).
WS_URL = "wss://api.openai.com/v1/realtime"
# ─── 1. Shared GPT translation / TTS helpers ─────────────────────────
async def gpt_translate(text, src, tgt):
    """Translate *text* from language *src* into *tgt*.

    Uses a one-shot chat completion and returns only the stripped
    translated text (the system prompt instructs the model to emit
    nothing else).
    """
    client = openai.AsyncClient()
    messages = [
        {"role": "system",
         "content": f"Translate {src} โ†’ {tgt}. Return only the text."},
        {"role": "user", "content": text},
    ]
    response = await client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages,
        temperature=0.3,
        max_tokens=2048,
    )
    return response.choices[0].message.content.strip()
async def gpt_tts(text, lang):
    """Synthesize *text* as speech in *lang*'s configured voice.

    Input is truncated to 4096 characters (the TTS API limit).  Returns
    the path of a temporary MP3 file containing the generated audio;
    the caller is responsible for the file's lifetime.
    """
    response = await openai.AsyncClient().audio.speech.create(
        model="tts-1", voice=VOICE[lang], input=text[:4096])
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as out:
        out.write(response.content)
    return out.name
# ─── 2. PDF translation ──────────────────────────────────────────────
def translate_pdf(file, src, tgt, max_pages=5):
    """Extract text from the leading pages of a PDF and translate it.

    Parameters
    ----------
    file : object | str | None
        Gradio upload (an object exposing ``.name``) or a plain path.
    src, tgt : str
        Source / target language names as used by ``gpt_translate``.
    max_pages : int, optional
        How many leading pages to extract (default 5, matching the
        previously hard-coded cap).

    Returns
    -------
    tuple[str, str]
        ``(extracted text, translation)``, or a warning message and an
        empty string when there is no file or no extractable text.
    """
    if not file:
        return "โš ๏ธ PDF ์—…๋กœ๋“œ ํ•„์š”", ""
    # Newer Gradio versions may pass a bare filepath string instead of a
    # tempfile-like object, so fall back to the value itself.
    path = getattr(file, "name", file)
    with pdfplumber.open(path) as pdf:
        text = "\n".join(
            page.extract_text() or "" for page in pdf.pages[:max_pages]
        ).strip()
    if not text:
        return "โš ๏ธ ํ…์ŠคํŠธ ์ถ”์ถœ ์‹คํŒจ", ""
    return text, asyncio.run(gpt_translate(text, src, tgt))
# ─── 2-1. Audio translation (tab 1) ──────────────────────────────────
async def translate_audio_async(file, src, tgt):
    """Transcribe an audio file, translate the transcript, and voice it.

    Parameters
    ----------
    file : str | None
        Path to the uploaded/recorded audio file.
    src, tgt : str
        Source / target language names.

    Returns
    -------
    tuple[str, str, str | None]
        ``(transcript, translation, path to TTS mp3)``; a warning triple
        with ``None`` audio when no file was provided.
    """
    if not file:
        return "โš ๏ธ ์˜ค๋””์˜ค ์—…๋กœ๋“œ ํ•„์š”", "", None
    # Proper ISO-639-1 codes for Whisper.  The old ``src[:2].lower()``
    # shortcut produced invalid codes for Chinese ("ch") and Spanish ("sp").
    iso = {"Korean": "ko", "English": "en", "Japanese": "ja", "Chinese": "zh",
           "Thai": "th", "Russian": "ru", "Vietnamese": "vi", "Spanish": "es",
           "French": "fr"}
    with open(file, "rb") as audio_file:
        transcript = await openai.AsyncClient().audio.transcriptions.create(
            model="whisper-1",
            file=audio_file,
            language=iso.get(src, src[:2].lower()),
        )
    orig_text = transcript.text
    trans_text = await gpt_translate(orig_text, src, tgt)
    audio_path = await gpt_tts(trans_text, tgt)
    return orig_text, trans_text, audio_path
def translate_audio(file, src, tgt):
    """Blocking wrapper around the async audio pipeline for Gradio callbacks."""
    return asyncio.run(translate_audio_async(file, src, tgt))
# ─── 3. Chunked STT via the Whisper API ──────────────────────────────
async def process_audio_chunk(audio_data, src_lang):
    """Transcribe one audio chunk to text with the Whisper API.

    Parameters
    ----------
    audio_data : tuple[int, np.ndarray] | bytes | None
        Either Gradio's ``(sample_rate, samples)`` tuple or raw WAV bytes.
    src_lang : str
        Source language name (mapped to an ISO-639-1 code for Whisper).

    Returns
    -------
    str
        The transcript, or ``""`` on missing input or any failure — errors
        are logged instead of raised so a streaming UI keeps running.
    """
    if audio_data is None:
        return ""
    tmp_path = None
    try:
        if isinstance(audio_data, tuple):
            sample_rate, audio_array = audio_data
            # Serialize the numpy buffer as a mono 16-bit PCM WAV file.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
                tmp_path = tmp.name
                with wave.open(tmp.name, "wb") as wav_file:
                    wav_file.setnchannels(1)   # mono
                    wav_file.setsampwidth(2)   # 16-bit
                    wav_file.setframerate(sample_rate)
                    # Float samples are assumed to lie in [-1, 1]; scale to PCM.
                    if audio_array.dtype in (np.float32, np.float64):
                        audio_array = (audio_array * 32767).astype(np.int16)
                    wav_file.writeframes(audio_array.tobytes())
        else:
            # Raw bytes: assumed to already be a complete WAV payload.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
                tmp.write(audio_data)
                tmp_path = tmp.name
        # Proper ISO-639-1 codes; ``src_lang[:2].lower()`` alone yields
        # invalid codes for Chinese ("ch") and Spanish ("sp").
        iso = {"Korean": "ko", "English": "en", "Japanese": "ja", "Chinese": "zh",
               "Thai": "th", "Russian": "ru", "Vietnamese": "vi", "Spanish": "es",
               "French": "fr"}
        with open(tmp_path, "rb") as audio_file:
            transcript = await openai.AsyncClient().audio.transcriptions.create(
                model="whisper-1",
                file=audio_file,
                language=iso.get(src_lang, src_lang[:2].lower()),
            )
        return transcript.text
    except Exception as e:
        print(f"STT ์˜ค๋ฅ˜: {e}")
        return ""
    finally:
        # Previously the temp file leaked whenever the API call raised;
        # always clean it up here.
        if tmp_path and os.path.exists(tmp_path):
            os.unlink(tmp_path)
# ─── 4. Gradio stream handlers (synchronous) ─────────────────────────
def realtime_single_sync(audio, src, tgt, state):
    """Streaming mic callback: buffer chunks, STT + translate every ~1.5 s.

    Parameters
    ----------
    audio : tuple[int, np.ndarray] | None
        One streamed chunk, or ``None`` when the stream ends.
    src, tgt : str
        Source / target language names.
    state : dict | None
        Accumulator carried between calls (transcript, translation,
        pending audio buffer, sample rate).

    Returns
    -------
    tuple[str, str, dict]
        ``(accumulated transcript, accumulated translation, state)``.
    """
    if state is None:
        state = {"orig": "", "trans": "", "audio_buffer": [], "sample_rate": None}

    def _flush(sample_rate):
        # Run STT on the concatenated buffer, translate the result, and
        # append both to the running texts.  (This logic was previously
        # duplicated for the mid-stream and end-of-stream paths.)
        combined = np.concatenate(state["audio_buffer"])
        text = asyncio.run(process_audio_chunk((sample_rate, combined), src))
        if text:
            state["orig"] = state["orig"] + " " + text if state["orig"] else text
            trans = asyncio.run(gpt_translate(text, src, tgt))
            state["trans"] = state["trans"] + " " + trans if state["trans"] else trans
        state["audio_buffer"] = []

    if audio is None:
        # End of stream: process whatever is left in the buffer.
        if state["audio_buffer"] and state["sample_rate"]:
            _flush(state["sample_rate"])
        return state["orig"], state["trans"], state

    if isinstance(audio, tuple):
        sample_rate, audio_array = audio
        state["sample_rate"] = sample_rate
        state["audio_buffer"].append(audio_array)
        # Only hit the APIs once ~1.5 s of audio has accumulated.  Summing
        # chunk lengths avoids re-concatenating the buffer on every call.
        buffered_samples = sum(len(chunk) for chunk in state["audio_buffer"])
        if buffered_samples / sample_rate >= 1.5:
            _flush(sample_rate)

    return state["orig"], state["trans"], state
def realtime_four_sync(audio, src, state):
    """Streaming mic callback: STT then fan-out translation to FOUR languages.

    Parameters
    ----------
    audio : tuple[int, np.ndarray] | None
        One streamed chunk, or ``None`` when the stream ends.
    src : str
        Source language name.
    state : dict | None
        Accumulator: transcript, one text slot per FOUR language, plus the
        pending audio buffer and sample rate.

    Returns
    -------
    tuple
        ``(orig, English, Chinese, Thai, Russian, state)``.
    """
    if state is None:
        state = {"orig": "", "English": "", "Chinese": "", "Thai": "", "Russian": "",
                 "audio_buffer": [], "sample_rate": None}

    def _flush(sample_rate):
        # STT on the concatenated buffer, then translate into all four
        # target languages concurrently.  (Previously duplicated for the
        # mid-stream and end-of-stream paths.)
        combined = np.concatenate(state["audio_buffer"])
        text = asyncio.run(process_audio_chunk((sample_rate, combined), src))
        if text:
            state["orig"] = state["orig"] + " " + text if state["orig"] else text

            async def _translate_all():
                return await asyncio.gather(
                    *(gpt_translate(text, src, lang) for lang in FOUR))

            for lang, trans in zip(FOUR, asyncio.run(_translate_all())):
                state[lang] = state[lang] + " " + trans if state[lang] else trans
        state["audio_buffer"] = []

    if audio is None:
        # End of stream: process whatever is left in the buffer.
        if state["audio_buffer"] and state["sample_rate"]:
            _flush(state["sample_rate"])
    elif isinstance(audio, tuple):
        sample_rate, audio_array = audio
        state["sample_rate"] = sample_rate
        state["audio_buffer"].append(audio_array)
        # Only hit the APIs once ~1.5 s of audio has accumulated.
        buffered_samples = sum(len(chunk) for chunk in state["audio_buffer"])
        if buffered_samples / sample_rate >= 1.5:
            _flush(sample_rate)

    return (state["orig"], state["English"], state["Chinese"],
            state["Thai"], state["Russian"], state)
# ─── 5. UI ───────────────────────────────────────────────────────────
with gr.Blocks(title="SMARTok Demo") as demo:
    with gr.Tabs():

        # Tab 1 – one-shot audio translation (upload or mic) with TTS playback
        with gr.TabItem("๐ŸŽ™๏ธ ์˜ค๋””์˜ค"):
            lang_in1 = gr.Dropdown(LANG, value="Korean", label="์ž…๋ ฅ ์–ธ์–ด")
            lang_out1 = gr.Dropdown(LANG, value="English", label="์ถœ๋ ฅ ์–ธ์–ด")
            audio_in1 = gr.Audio(sources=["microphone", "upload"], type="filepath")
            run_btn1 = gr.Button("๋ฒˆ์—ญ")
            orig_box1 = gr.Textbox(label="์›๋ฌธ")
            trans_box1 = gr.Textbox(label="๋ฒˆ์—ญ")
            tts_out1 = gr.Audio(label="TTS", type="filepath", autoplay=True)
            run_btn1.click(translate_audio,
                           [audio_in1, lang_in1, lang_out1],
                           [orig_box1, trans_box1, tts_out1])

        # Tab 2 – PDF translation
        with gr.TabItem("๐Ÿ“„ PDF"):
            lang_in2 = gr.Dropdown(LANG, value="Korean", label="์ž…๋ ฅ ์–ธ์–ด")
            lang_out2 = gr.Dropdown(LANG, value="English", label="์ถœ๋ ฅ ์–ธ์–ด")
            pdf_in = gr.File(file_types=[".pdf"])
            run_btn2 = gr.Button("๋ฒˆ์—ญ")
            orig_box2 = gr.Textbox(label="์ถ”์ถœ ์›๋ฌธ", lines=15)
            trans_box2 = gr.Textbox(label="๋ฒˆ์—ญ ๊ฒฐ๊ณผ", lines=15)
            run_btn2.click(translate_pdf,
                           [pdf_in, lang_in2, lang_out2],
                           [orig_box2, trans_box2])

        # Tab 3 – realtime streaming into a single target language
        with gr.TabItem("โฑ๏ธ ์‹ค์‹œ๊ฐ„ 1"):
            lang_in3 = gr.Dropdown(LANG, value="Korean", label="์ž…๋ ฅ ์–ธ์–ด")
            lang_out3 = gr.Dropdown(LANG, value="English", label="์ถœ๋ ฅ ์–ธ์–ด")
            mic_in3 = gr.Audio(sources=["microphone"], streaming=True)
            orig_box3 = gr.Textbox(label="์›๋ฌธ(์‹ค์‹œ๊ฐ„)", lines=8)
            trans_box3 = gr.Textbox(label="๋ฒˆ์—ญ(์‹ค์‹œ๊ฐ„)", lines=8)
            stream_state3 = gr.State()
            mic_in3.stream(
                realtime_single_sync,
                inputs=[mic_in3, lang_in3, lang_out3, stream_state3],
                outputs=[orig_box3, trans_box3, stream_state3],
            )

        # Tab 4 – realtime streaming into four target languages at once
        with gr.TabItem("๐ŸŒ ์‹ค์‹œ๊ฐ„ 4"):
            lang_in4 = gr.Dropdown(LANG, value="Korean", label="์ž…๋ ฅ ์–ธ์–ด")
            mic_in4 = gr.Audio(sources=["microphone"], streaming=True)
            orig_box4 = gr.Textbox(label="์›๋ฌธ", lines=8)
            en_box4 = gr.Textbox(label="English", lines=8)
            zh_box4 = gr.Textbox(label="Chinese(็ฎ€ไฝ“)", lines=8)
            th_box4 = gr.Textbox(label="Thai", lines=8)
            ru_box4 = gr.Textbox(label="Russian", lines=8)
            stream_state4 = gr.State()
            mic_in4.stream(
                realtime_four_sync,
                inputs=[mic_in4, lang_in4, stream_state4],
                outputs=[orig_box4, en_box4, zh_box4, th_box4, ru_box4, stream_state4],
            )

demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)