Spaces:
Running
on
Zero
Running
on
Zero
import gradio as gr | |
import openai, os, io, tempfile, wave, time | |
from dotenv import load_dotenv | |
# =============== ๊ณตํต ์ด๊ธฐํ ======================================== | |
load_dotenv() | |
api_key = os.getenv("OPENAI_API_KEY") | |
if not api_key: | |
raise RuntimeError("OPENAI_API_KEY๋ฅผ .env ํ์ผ์ ์ค์ ํ์ธ์!") | |
client = openai.OpenAI(api_key=api_key) | |
# ---------- Supported languages --------------------------------------
# Display name -> two-letter language code (used as the `language` hint for
# speech-to-text). This mapping is the single source of truth; its insertion
# order is the order of the dropdown choices.
LANG_CODE = {
    "Korean": "ko",
    "English": "en",
    "Japanese": "ja",
    "Chinese": "zh",
    "Thai": "th",
    "Russian": "ru",
    "Vietnamese": "vi",
    "Spanish": "es",
    "French": "fr",
}

# Dropdown choices, derived from LANG_CODE so the two can never drift apart.
LANGUAGES = list(LANG_CODE)

# Languages spoken with the "nova" TTS voice; every other one uses "alloy".
_NOVA_LANGS = frozenset({"Korean", "Japanese", "Chinese"})

# Display name -> TTS voice name.
VOICE = {
    lang: ("nova" if lang in _NOVA_LANGS else "alloy")
    for lang in LANGUAGES
}
# ---------- Common utilities ----------------------------------------
def _gpt_translate(text: str, src: str, tgt: str) -> str:
    """Translate ``text`` from ``src`` to ``tgt`` via the chat-completions API.

    Args:
        text: Text to translate.
        src: Name of the source language (e.g. ``"Korean"``); it is
            interpolated into the system prompt.
        tgt: Name of the target language, interpolated the same way.

    Returns:
        The model's reply with surrounding whitespace removed. An empty
        string is returned when ``text`` is blank (no request is sent) or
        when the reply carries no text content.
    """
    # Nothing to translate: skip the round trip entirely.
    if not text or not text.strip():
        return ""

    system_prompt = (
        f"You are a professional translator. Translate the following {src} text to {tgt}. "
        f"Only provide the translation without additional commentary."
    )
    rsp = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": text},
        ],
        temperature=0.3,
        max_tokens=2048,
    )
    # `content` may be None, so guard before calling .strip().
    content = rsp.choices[0].message.content
    return (content or "").strip()
def _tts(text: str, lang: str) -> str:
    """Synthesize ``text`` to speech and return the path of the saved MP3.

    Args:
        text: Text to speak. Only the first 4096 characters are sent.
        lang: Language display name used to pick a voice from ``VOICE``;
            unknown names fall back to ``"alloy"``.

    Returns:
        Path of a newly created ``.mp3`` temporary file. The file is created
        with ``delete=False``, so the caller owns it and it is not removed
        automatically.
    """
    out = client.audio.speech.create(
        model="tts-1",
        voice=VOICE.get(lang, "alloy"),
        input=text[:4096],
    )
    # The context manager closes the handle even if the write fails.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
        tmp.write(out.content)
    return tmp.name
# =============== 1) Shared microphone/file processing ================
def translate_audio(audio_path, src, tgt):
    """Transcribe an audio file, translate the transcript, and speak it.

    Args:
        audio_path: Path of the recorded/uploaded audio file. May be ``None``
            or empty when the user supplied no audio.
        src: Display name of the spoken language (looked up in ``LANG_CODE``).
        tgt: Display name of the language to translate into.

    Returns:
        A 3-tuple ``(original_text, translated_text, tts_path)``. When there
        is no audio or nothing was recognized, the tuple is
        ``(warning_message, "", None)``. ``tts_path`` is also ``None`` when
        the translation came back empty.
    """
    original = ""
    if audio_path:
        with open(audio_path, "rb") as f:
            stt = client.audio.transcriptions.create(
                model="whisper-1",
                file=f,
                language=LANG_CODE.get(src),
            )
        original = stt.text.strip()

    if not original:
        # Warning shown in the "original text" box: "speech recognition failed".
        return "⚠️ 음성 인식 실패", "", None

    translated = _gpt_translate(original, src, tgt)
    # Do not request speech for an empty translation.
    tts_path = _tts(translated, tgt) if translated else None
    return original, translated, tts_path
# =============== 2) Real-time streaming (beta) ========================
STREAM_CHUNK_SEC = 4  # seconds of audio buffered before each Whisper call
_WAV_HEADER_LEN = 44  # bytes taken from the first chunk as the WAV header


def _translate_segment(wav_header, pcm, src, tgt):
    """Run one buffered audio segment through ``translate_audio``.

    The segment is written to a temporary WAV file (header + payload). That
    file and the TTS file produced as a by-product are removed afterwards.

    Returns:
        ``(original, translated)``, or ``None`` when ``translate_audio``
        returned no TTS path, which is how it reports that it produced no
        usable result.
    """
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    tts_path = None
    try:
        # Close the file before translate_audio reopens it by name.
        with tmp:
            tmp.write(wav_header + pcm)
        original, translated, tts_path = translate_audio(tmp.name, src, tgt)
    finally:
        # Streaming only needs the text; do not leave temp files behind.
        for path in (tmp.name, tts_path):
            if path and os.path.exists(path):
                os.remove(path)
    if tts_path is None:
        return None
    return original, translated


def stream_generator(mic_stream, src, tgt):
    """Yield the running transcript and translation of a live audio stream.

    Args:
        mic_stream: Object whose ``recv()`` returns the next chunk of bytes,
            or ``None`` once the stream has ended.
        src: Display name of the spoken language.
        tgt: Display name of the language to translate into.

    Yields:
        ``(accumulated_original, accumulated_translation)`` each time more
        than ``STREAM_CHUNK_SEC`` seconds of audio has been buffered, and once
        more for any audio left over when the stream ends.

    NOTE(review): the first chunk is assumed to begin with a 44-byte WAV
    header describing 16 kHz, 16-bit mono PCM; confirm against the actual
    stream source. The captured header is reused verbatim for every segment,
    so its size fields are not updated.
    """
    buffer = io.BytesIO()
    wav_header = None
    originals, translations = [], []
    # 16000 samples/s * 2 bytes/sample * seconds per segment.
    flush_threshold = 16000 * 2 * STREAM_CHUNK_SEC

    def _consume(pcm):
        # Append the segment's text unless recognition produced nothing.
        result = _translate_segment(wav_header, pcm, src, tgt)
        if result is not None:
            originals.append(result[0])
            translations.append(result[1])
        return " ".join(originals).strip(), " ".join(translations).strip()

    while True:
        chunk = mic_stream.recv()  # bytes
        if chunk is None:  # end of stream
            break
        if not wav_header:
            # Split the header off so it is not buffered as audio and then
            # prepended a second time when the segment is written out.
            wav_header = chunk[:_WAV_HEADER_LEN]
            chunk = chunk[_WAV_HEADER_LEN:]
        buffer.write(chunk)
        # The buffer is append-only, so tell() equals its size.
        if buffer.tell() > flush_threshold:
            yield _consume(buffer.getvalue())
            buffer = io.BytesIO()  # start a fresh segment

    # Flush whatever audio is left after the stream ended.
    if buffer.tell():
        yield _consume(buffer.getvalue())
# =============== 3) Simultaneous translation into four languages =====
# Target languages for the four-language tab, in output order.
FOUR_LANGS = ["English", "Chinese", "Thai", "Russian"]


def translate_audio_four(audio_path, src):
    """Transcribe once, then translate into every language in ``FOUR_LANGS``.

    Args:
        audio_path: Path of the audio file. May be ``None`` or empty when the
            user supplied no audio.
        src: Display name of the spoken language (looked up in ``LANG_CODE``).

    Returns:
        A list of ``1 + len(FOUR_LANGS)`` strings: the transcript followed by
        one translation per target language, in ``FOUR_LANGS`` order. When
        there is no audio or nothing was recognized, the first item is a
        warning message and the rest are empty strings.
    """
    original = ""
    if audio_path:
        with open(audio_path, "rb") as f:
            stt = client.audio.transcriptions.create(
                model="whisper-1",
                file=f,
                language=LANG_CODE.get(src),
            )
        original = stt.text.strip()

    if not original:
        # Warning text: "speech recognition failed".
        return ["⚠️ 음성 인식 실패"] + [""] * len(FOUR_LANGS)

    # The translations are requested one after another.
    return [original] + [
        _gpt_translate(original, src, lang) for lang in FOUR_LANGS
    ]
# =============== Gradio UI ==========================================
# … (the common initialization and functions above are unchanged) …
with gr.Blocks(title="SMARTok Demo", theme=gr.themes.Soft()) as app:
    with gr.Tabs():
        # (1) Microphone + file translation
        with gr.TabItem("🎙️ 마이크/파일 번역"):
            src1 = gr.Dropdown(LANGUAGES, value="Korean", label="입력")
            tgt1 = gr.Dropdown(LANGUAGES, value="English", label="출력")
            mic1 = gr.Audio(
                sources=["microphone", "upload"],  # both inputs are allowed
                type="filepath",
                label="🎤 녹음 또는 오디오 파일 업로드",
            )
            btn1 = gr.Button("번역")
            stt1 = gr.Textbox(label="원문", lines=5)
            tlt1 = gr.Textbox(label="번역", lines=5)
            out1 = gr.Audio(label="TTS", type="filepath", autoplay=True)
            # translate_audio returns (original, translation, TTS path),
            # matching the three outputs in order.
            btn1.click(
                translate_audio,
                inputs=[mic1, src1, tgt1],
                outputs=[stt1, tlt1, out1],
            )

        # (2) Audio-file-only translation (unchanged)
        with gr.TabItem("🎧 파일 전용 번역"):
            src2 = gr.Dropdown(LANGUAGES, value="Korean", label="입력")
            tgt2 = gr.Dropdown(LANGUAGES, value="English", label="출력")
            file2 = gr.Audio(
                sources=["upload"],
                type="filepath",
                label="오디오 파일 업로드",
            )
            btn2 = gr.Button("번역")
            stt2 = gr.Textbox(label="원문", lines=5)
            tlt2 = gr.Textbox(label="번역", lines=5)
            out2 = gr.Audio(label="TTS", type="filepath", autoplay=True)
            btn2.click(
                translate_audio,
                inputs=[file2, src2, tgt2],
                outputs=[stt2, tlt2, out2],
            )

        # (3) Real-time streaming translation (Beta) … same as before …
        # (4) Simultaneous four-language translation … same as before …
# ===================== Entry point ===================================
if __name__ == "__main__":
    # Launch settings are collected in one place and handed over in a
    # single call.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "debug": True,
    }
    app.launch(**launch_options)