# voice-trans / app.py
# (Hugging Face Space file-viewer chrome, kept as a comment: uploaded by
#  "openfree", commit 2adfcbe verified, 8.56 kB — not part of the program.)
import gradio as gr
import openai, os, io, tempfile
from dotenv import load_dotenv
# ============== Environment variables & OpenAI initialization ========
load_dotenv()  # pull OPENAI_API_KEY (and friends) from a local .env file
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    # Fail fast at import time rather than on the first API call.
    raise RuntimeError("OPENAI_API_KEY๋ฅผ .env ํŒŒ์ผ์— ์„ค์ •ํ•˜์„ธ์š”!")
# Single shared client used by every helper below.
client = openai.OpenAI(api_key=api_key)
# ============== Language configuration ================================
LANGUAGES = [
    "Korean", "English", "Japanese", "Chinese",
    "Thai", "Russian", "Vietnamese",
    "Spanish", "French",
]

# ISO-639-1 code handed to Whisper for each supported language.
LANG_CODE = {
    "Korean": "ko",
    "English": "en",
    "Japanese": "ja",
    "Chinese": "zh",
    "Thai": "th",
    "Russian": "ru",
    "Vietnamese": "vi",
    "Spanish": "es",
    "French": "fr",
}

# TTS voice per language: "nova" for the CJK languages, "alloy" otherwise.
_NOVA_LANGS = {"Korean", "Japanese", "Chinese"}
VOICE = {lang: "nova" if lang in _NOVA_LANGS else "alloy" for lang in LANGUAGES}
# ============== Shared helpers ========================================
def _gpt_translate(text: str, src: str, tgt: str) -> str:
    """Translate *text* from *src* to *tgt* with GPT-3.5 and return the bare translation."""
    system_prompt = (
        f"You are a professional translator. Translate the following {src} text to {tgt}. "
        f"Only provide the translation without additional commentary."
    )
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=conversation,
        temperature=0.3,   # low temperature keeps translations stable
        max_tokens=2048,
    )
    return response.choices[0].message.content.strip()
def _tts(text: str, lang: str) -> str:
    """Synthesize *text* with TTS-1 and return the path of a temporary mp3 file."""
    voice = VOICE.get(lang, "alloy")  # unknown languages fall back to "alloy"
    speech = client.audio.speech.create(
        model="tts-1",
        voice=voice,
        input=text[:4096],  # keep the request within the 4096-char input cap
    )
    # delete=False: the caller (Gradio) reads the file after we return its path.
    mp3_path = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3").name
    with open(mp3_path, "wb") as fh:
        fh.write(speech.content)
    return mp3_path
def translate_audio(audio_path, src, tgt):
    """Translate one audio file end to end: Whisper STT -> GPT -> TTS.

    Returns (original_text, translated_text, tts_mp3_path); when speech
    recognition yields nothing, returns a warning marker, an empty string
    and no audio path.
    """
    with open(audio_path, "rb") as audio_file:
        transcription = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file,
            language=LANG_CODE.get(src),
        )
    recognized = transcription.text.strip()
    if not recognized:
        return "โš ๏ธ ์Œ์„ฑ ์ธ์‹ ์‹คํŒจ", "", None
    translation = _gpt_translate(recognized, src, tgt)
    return recognized, translation, _tts(translation, tgt)
# ============== Real-time streaming transcription / translation =======
STREAM_CHUNK_SEC = 4  # interval, in seconds of audio, between Whisper calls
def stream_generator(mic_stream, src, tgt):
    """Microphone stream -> periodic chunk translation (cumulative output).

    Buffers raw audio from *mic_stream* until roughly STREAM_CHUNK_SEC
    seconds of 16 kHz 16-bit mono samples have accumulated, then writes the
    buffer to a temp WAV, runs it through translate_audio(), and yields the
    accumulated (original, translated) text so far.

    NOTE(review): assumes mic_stream exposes a blocking .recv() that returns
    None at end-of-stream — confirm against the actual Gradio caller, whose
    streaming callback normally receives (sample_rate, ndarray) chunks.
    """
    buffer = io.BytesIO()
    wav_header = None
    original_acc, translated_acc = "", ""
    while True:
        chunk = mic_stream.recv()
        if chunk is None:  # end of stream
            break
        if not wav_header:
            wav_header = chunk[:44]  # WAV header (assumed 16 kHz 16-bit mono — TODO confirm)
        buffer.write(chunk)
        # Call Whisper once enough audio has piled up:
        # 16000 samples/s * 2 bytes/sample * STREAM_CHUNK_SEC seconds.
        if buffer.getbuffer().nbytes > 16000 * 2 * STREAM_CHUNK_SEC:
            # NOTE(review): on the first window the buffer already begins with
            # the 44-byte header, so prepending wav_header duplicates it here —
            # verify whether Whisper tolerates the doubled header.
            wav_bytes = wav_header + buffer.getvalue()
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
                tmp.write(wav_bytes)
                tmp.close()
            o, t, _ = translate_audio(tmp.name, src, tgt)
            original_acc += " " + o
            translated_acc += " " + t
            yield original_acc.strip(), translated_acc.strip()
            buffer = io.BytesIO()  # reset buffer for the next window
    # Flush whatever audio remains once the stream has ended.
    if buffer.getbuffer().nbytes:
        wav_bytes = wav_header + buffer.getvalue()
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
            tmp.write(wav_bytes)
            tmp.close()
        o, t, _ = translate_audio(tmp.name, src, tgt)
        yield (original_acc + " " + o).strip(), (translated_acc + " " + t).strip()
# ============== Simultaneous four-language translation ================
FOUR_LANGS = ["English", "Chinese", "Thai", "Russian"]  # fixed targets for the 4-way tab
def translate_audio_four(audio_path, src):
    """Transcribe an audio file and translate it into all FOUR_LANGS targets.

    Returns a list of five strings: [original, English, Chinese, Thai,
    Russian]. On STT failure the original slot carries a warning marker and
    the four translation slots are empty.
    """
    with open(audio_path, "rb") as audio_file:
        transcription = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file,
            language=LANG_CODE.get(src),
        )
    original = transcription.text.strip()
    if not original:
        return ["โš ๏ธ ์Œ์„ฑ ์ธ์‹ ์‹คํŒจ"] + [""] * 4
    translations = [_gpt_translate(original, src, lang) for lang in FOUR_LANGS]
    return [original] + translations  # five items: original + four languages
# ============== Gradio UI ===========================================
with gr.Blocks(title="SMARTok Demo", theme=gr.themes.Soft()) as app:
    with gr.Tabs():
        # Tab 1: microphone or uploaded-file translation (with TTS playback)
        with gr.TabItem("๐ŸŽ™๏ธ ๋งˆ์ดํฌ/ํŒŒ์ผ ๋ฒˆ์—ญ"):
            src1 = gr.Dropdown(LANGUAGES, value="Korean", label="์ž…๋ ฅ")
            tgt1 = gr.Dropdown(LANGUAGES, value="English", label="์ถœ๋ ฅ")
            mic1 = gr.Audio(
                sources=["microphone", "upload"],
                type="filepath",
                label="๐ŸŽค ๋…น์Œ ๋˜๋Š” ์˜ค๋””์˜ค ํŒŒ์ผ ์—…๋กœ๋“œ"
            )
            btn1 = gr.Button("๋ฒˆ์—ญ")
            stt1 = gr.Textbox(label="์›๋ฌธ", lines=5)
            tlt1 = gr.Textbox(label="๋ฒˆ์—ญ", lines=5)
            out1 = gr.Audio(label="TTS", type="filepath", autoplay=True)
            btn1.click(
                translate_audio,
                inputs=[mic1, src1, tgt1],
                outputs=[stt1, tlt1, out1]
            )
        # Tab 2: upload-only translation (same pipeline, no microphone source)
        with gr.TabItem("๐ŸŽง ํŒŒ์ผ ์ „์šฉ ๋ฒˆ์—ญ"):
            src2 = gr.Dropdown(LANGUAGES, value="Korean", label="์ž…๋ ฅ")
            tgt2 = gr.Dropdown(LANGUAGES, value="English", label="์ถœ๋ ฅ")
            file2 = gr.Audio(
                sources=["upload"],
                type="filepath",
                label="์˜ค๋””์˜ค ํŒŒ์ผ ์—…๋กœ๋“œ"
            )
            btn2 = gr.Button("๋ฒˆ์—ญ")
            stt2 = gr.Textbox(label="์›๋ฌธ", lines=5)
            tlt2 = gr.Textbox(label="๋ฒˆ์—ญ", lines=5)
            out2 = gr.Audio(label="TTS", type="filepath", autoplay=True)
            btn2.click(
                translate_audio,
                inputs=[file2, src2, tgt2],
                outputs=[stt2, tlt2, out2]
            )
        # Tab 3: real-time streaming translation (Beta)
        with gr.TabItem("โฑ๏ธ ์‹ค์‹œ๊ฐ„ ๋ฒˆ์—ญ (Beta)"):
            gr.Markdown("๋งˆ์ดํฌ๋ฅผ ์ผœ๋ฉด 3~4์ดˆ ๊ฐ„๊ฒฉ์œผ๋กœ ์ž๋ง‰์ด ๊ฐฑ์‹ ๋ฉ๋‹ˆ๋‹ค.")
            src3 = gr.Dropdown(LANGUAGES, value="Korean", label="์ž…๋ ฅ")
            tgt3 = gr.Dropdown(LANGUAGES, value="English", label="์ถœ๋ ฅ")
            mic3 = gr.Audio(
                sources=["microphone"],
                streaming=True,
                label="๐ŸŽค ์‹ค์‹œ๊ฐ„ ๋งˆ์ดํฌ ์ž…๋ ฅ"
            )
            stt3 = gr.Textbox(label="์›๋ฌธ(์‹ค์‹œ๊ฐ„)", lines=8)
            tlt3 = gr.Textbox(label="๋ฒˆ์—ญ(์‹ค์‹œ๊ฐ„)", lines=8)
            # Thin wrapper that forwards the stream to stream_generator().
            def gen(audio, src_lang, tgt_lang):
                yield from stream_generator(audio, src_lang, tgt_lang)
            # NOTE(review): gen() takes (audio, src_lang, tgt_lang) but only
            # [src3, tgt3] are wired as inputs — whether Gradio injects the
            # streaming audio implicitly here needs confirmation.
            mic3.stream(gen, inputs=[src3, tgt3], outputs=[stt3, tlt3])
        # Tab 4: translate one recording into four languages at once
        with gr.TabItem("๐ŸŒ 4๊ฐœ ์–ธ์–ด ๋™์‹œ"):
            gr.Markdown("์ž…๋ ฅ ์Œ์„ฑ์„ **English / Chinese(็ฎ€ไฝ“) / Thai / Russian** 4๊ฐœ ์–ธ์–ด๋กœ ๋™์‹œ์— ๋ฒˆ์—ญํ•ฉ๋‹ˆ๋‹ค.")
            src4 = gr.Dropdown(LANGUAGES, value="Korean", label="์ž…๋ ฅ ์–ธ์–ด")
            aud4 = gr.Audio(
                sources=["microphone", "upload"],
                type="filepath",
                label="๐ŸŽค ๋…น์Œ ๋˜๋Š” ์˜ค๋””์˜ค ํŒŒ์ผ ์—…๋กœ๋“œ"
            )
            btn4 = gr.Button("๋ฒˆ์—ญ")
            with gr.Row():
                org4 = gr.Textbox(label="์›๋ฌธ", lines=4)
                en4 = gr.Textbox(label="English", lines=4)
                zh4 = gr.Textbox(label="Chinese (็ฎ€ไฝ“)", lines=4)
                th4 = gr.Textbox(label="Thai", lines=4)
                ru4 = gr.Textbox(label="Russian", lines=4)
            # Output order must match translate_audio_four's return list.
            btn4.click(
                translate_audio_four,
                inputs=[aud4, src4],
                outputs=[org4, en4, zh4, th4, ru4]
            )
# ============== App entry point =====================================
if __name__ == "__main__":
    # 0.0.0.0 binds all interfaces so the app is reachable from outside a container.
    app.launch(server_name="0.0.0.0", server_port=7860, share=False, debug=True)