import gradio as gr
import openai, os, tempfile, wave
import numpy as np
from dotenv import load_dotenv

# ============== Environment variables & OpenAI init ==================
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError("Set OPENAI_API_KEY in your .env file!")
client = openai.OpenAI(api_key=api_key)
# ============== Language settings ====================================
LANGUAGES = [
    "Korean", "English", "Japanese", "Chinese",
    "Thai", "Russian", "Vietnamese",
    "Spanish", "French"
]
LANG_CODE = {
    "Korean": "ko", "English": "en", "Japanese": "ja", "Chinese": "zh",
    "Thai": "th", "Russian": "ru", "Vietnamese": "vi",
    "Spanish": "es", "French": "fr"
}
# TTS voice per target language: "nova" for Korean/Japanese/Chinese, "alloy" otherwise
VOICE = {
    lang: ("nova" if lang in ["Korean", "Japanese", "Chinese"] else "alloy")
    for lang in LANGUAGES
}
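# Note (an aside, not part of the original mapping): OpenAI's tts-1 also exposes
# the voices "echo", "fable", "onyx", and "shimmer"; any of them can be swapped
# into VOICE above if a different timbre is preferred, e.g.:
#     VOICE["French"] = "shimmer"   # hypothetical override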
# ============== Common functions =====================================
def _gpt_translate(text: str, src: str, tgt: str) -> str:
    """Translate text from src to tgt with GPT-3.5."""
    rsp = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "system",
                "content": (
                    f"You are a professional translator. Translate the following {src} text to {tgt}. "
                    f"Only provide the translation without additional commentary."
                )
            },
            {"role": "user", "content": text}
        ],
        temperature=0.3,
        max_tokens=2048
    )
    return rsp.choices[0].message.content.strip()
def _tts(text: str, lang: str) -> str:
    """Synthesize speech with TTS-1 and return the path to a temporary mp3."""
    out = client.audio.speech.create(
        model="tts-1",
        voice=VOICE.get(lang, "alloy"),
        input=text[:4096]          # tts-1 accepts at most 4096 characters
    )
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
    tmp.write(out.content)
    tmp.close()
    return tmp.name
def translate_audio(audio_path, src, tgt):
    """Translate a single audio file (speech-to-text, translation, TTS)."""
    with open(audio_path, "rb") as f:
        stt = client.audio.transcriptions.create(
            model="whisper-1",
            file=f,
            language=LANG_CODE.get(src)
        )
    original = stt.text.strip()
    if not original:
        return "⚠️ Speech recognition failed", "", None
    translated = _gpt_translate(original, src, tgt)
    tts_path = _tts(translated, tgt)
    return original, translated, tts_path
# ============== Real-time streaming transcription/translation ========
STREAM_CHUNK_SEC = 4  # seconds of audio to buffer between Whisper calls

def _write_wav(samples: np.ndarray, sample_rate: int) -> str:
    """Write a mono 16-bit PCM buffer to a temporary WAV file and return its path."""
    if samples.ndim > 1:                       # fold multi-channel audio down to mono
        samples = samples.mean(axis=1).astype(samples.dtype)
    if samples.dtype != np.int16:              # e.g. float samples in [-1, 1]
        samples = (samples * 32767).astype(np.int16)
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    tmp.close()
    with wave.open(tmp.name, "wb") as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)                     # 16-bit PCM
        wf.setframerate(sample_rate)
        wf.writeframes(samples.tobytes())
    return tmp.name

def stream_translate(new_chunk, state, src, tgt):
    """Microphone stream -> periodic chunk translation (accumulated output).

    Gradio invokes this once per incoming chunk: `new_chunk` is a
    (sample_rate, samples) tuple and `state` carries the audio buffer and
    the accumulated transcripts between calls.
    """
    if state is None:
        state = {"audio": None, "original": "", "translated": ""}
    if new_chunk is None:                      # stream ended
        return state, state["original"], state["translated"]
    sr, samples = new_chunk
    state["audio"] = samples if state["audio"] is None else np.concatenate([state["audio"], samples])
    # Call Whisper once STREAM_CHUNK_SEC seconds of audio have accumulated
    if state["audio"].shape[0] >= sr * STREAM_CHUNK_SEC:
        o, t, _ = translate_audio(_write_wav(state["audio"], sr), src, tgt)
        state["original"] = (state["original"] + " " + o).strip()
        state["translated"] = (state["translated"] + " " + t).strip()
        state["audio"] = None                  # reset the buffer
    return state, state["original"], state["translated"]
# ============== Simultaneous 4-language translation ==================
FOUR_LANGS = ["English", "Chinese", "Thai", "Russian"]

def translate_audio_four(audio_path, src):
    """Audio file -> original text plus translations into the four target languages."""
    with open(audio_path, "rb") as f:
        stt = client.audio.transcriptions.create(
            model="whisper-1",
            file=f,
            language=LANG_CODE.get(src)
        )
    original = stt.text.strip()
    if not original:
        return ["⚠️ Speech recognition failed"] + [""] * 4
    outs = [original]
    for lang in FOUR_LANGS:
        outs.append(_gpt_translate(original, src, lang))
    return outs  # 5 values in total (original + 4 languages)
# ============== Gradio UI ============================================
with gr.Blocks(title="SMARTok Demo", theme=gr.themes.Soft()) as app:
    with gr.Tabs():
        # ① Mic/file translation
        with gr.TabItem("🎙️ Mic/File Translation"):
            src1 = gr.Dropdown(LANGUAGES, value="Korean", label="Input")
            tgt1 = gr.Dropdown(LANGUAGES, value="English", label="Output")
            mic1 = gr.Audio(
                sources=["microphone", "upload"],
                type="filepath",
                label="🎤 Record or upload an audio file"
            )
            btn1 = gr.Button("Translate")
            stt1 = gr.Textbox(label="Original", lines=5)
            tlt1 = gr.Textbox(label="Translation", lines=5)
            out1 = gr.Audio(label="TTS", type="filepath", autoplay=True)
            btn1.click(
                translate_audio,
                inputs=[mic1, src1, tgt1],
                outputs=[stt1, tlt1, out1]
            )
        # ② File-only translation
        with gr.TabItem("🎧 File-Only Translation"):
            src2 = gr.Dropdown(LANGUAGES, value="Korean", label="Input")
            tgt2 = gr.Dropdown(LANGUAGES, value="English", label="Output")
            file2 = gr.Audio(
                sources=["upload"],
                type="filepath",
                label="Upload an audio file"
            )
            btn2 = gr.Button("Translate")
            stt2 = gr.Textbox(label="Original", lines=5)
            tlt2 = gr.Textbox(label="Translation", lines=5)
            out2 = gr.Audio(label="TTS", type="filepath", autoplay=True)
            btn2.click(
                translate_audio,
                inputs=[file2, src2, tgt2],
                outputs=[stt2, tlt2, out2]
            )
        # ③ Real-time streaming translation (Beta)
        with gr.TabItem("⏱️ Real-Time Translation (Beta)"):
            gr.Markdown("Once the microphone is on, the captions refresh every 3-4 seconds.")
            src3 = gr.Dropdown(LANGUAGES, value="Korean", label="Input")
            tgt3 = gr.Dropdown(LANGUAGES, value="English", label="Output")
            mic3 = gr.Audio(
                sources=["microphone"],
                streaming=True,
                label="🎤 Live microphone input"
            )
            stt3 = gr.Textbox(label="Original (live)", lines=8)
            tlt3 = gr.Textbox(label="Translation (live)", lines=8)
            state3 = gr.State()    # carries the audio buffer and transcripts between chunks
            mic3.stream(
                stream_translate,
                inputs=[mic3, state3, src3, tgt3],
                outputs=[state3, stt3, tlt3]
            )
        # ④ Simultaneous 4-language translation
        with gr.TabItem("🌍 4 Languages at Once"):
            gr.Markdown("Translates the input audio into **English / Chinese (Simplified) / Thai / Russian** at the same time.")
            src4 = gr.Dropdown(LANGUAGES, value="Korean", label="Input language")
            aud4 = gr.Audio(
                sources=["microphone", "upload"],
                type="filepath",
                label="🎤 Record or upload an audio file"
            )
            btn4 = gr.Button("Translate")
            with gr.Row():
                org4 = gr.Textbox(label="Original", lines=4)
                en4 = gr.Textbox(label="English", lines=4)
                zh4 = gr.Textbox(label="Chinese (Simplified)", lines=4)
                th4 = gr.Textbox(label="Thai", lines=4)
                ru4 = gr.Textbox(label="Russian", lines=4)
            btn4.click(
                translate_audio_four,
                inputs=[aud4, src4],
                outputs=[org4, en4, zh4, th4, ru4]
            )
# ============== Run the app ==========================================
if __name__ == "__main__":
    app.launch(server_name="0.0.0.0", server_port=7860, share=False, debug=True)
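# --- Local setup (a minimal sketch; the exact package list is an assumption) ---
# .env file next to this script:
#     OPENAI_API_KEY=sk-...
# Install dependencies and run:
#     pip install gradio openai python-dotenv numpy
#     python <this script>.py      # the UI is served at http://localhost:7860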