# voice-trans / app.py  (revision e49bf8d, by openfree)
# Speech translation demo: Whisper STT -> GPT translation -> OpenAI TTS,
# served through a Gradio UI.
import gradio as gr
import openai, os, io, tempfile, wave, time
from dotenv import load_dotenv
# =============== Shared initialisation ==============================
# Pull OPENAI_API_KEY out of a local .env file and build one shared client.
load_dotenv()

api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    # Fail fast at import time: nothing below works without a key.
    raise RuntimeError("OPENAI_API_KEY๋ฅผ .env ํŒŒ์ผ์— ์„ค์ •ํ•˜์„ธ์š”!")

client = openai.OpenAI(api_key=api_key)
# ---------- Supported languages -------------------------------------
LANGUAGES = [
    "Korean", "English", "Japanese", "Chinese",
    "Thai", "Russian", "Vietnamese",
    "Spanish", "French",
]

# ISO-639-1 codes, used as Whisper's `language` hint (order mirrors LANGUAGES).
LANG_CODE = dict(zip(
    LANGUAGES,
    ["ko", "en", "ja", "zh", "th", "ru", "vi", "es", "fr"],
))

# TTS voice per language: "nova" for Korean/Japanese/Chinese, "alloy" otherwise.
_NOVA_LANGS = frozenset({"Korean", "Japanese", "Chinese"})
VOICE = {lang: "nova" if lang in _NOVA_LANGS else "alloy" for lang in LANGUAGES}
# ---------- Shared helpers ------------------------------------------
def _gpt_translate(text: str, src: str, tgt: str) -> str:
    """Translate *text* from *src* to *tgt* with a single chat-completion call."""
    system_prompt = (
        f"You are a professional translator. Translate the following {src} text to {tgt}. "
        f"Only provide the translation without additional commentary."
    )
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]
    # Low temperature keeps the translation close to literal.
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages,
        temperature=0.3,
        max_tokens=2048,
    )
    return response.choices[0].message.content.strip()
def _tts(text: str, lang: str) -> str:
    """Synthesise *text* with OpenAI TTS and return the path of an .mp3 file.

    The voice is looked up in VOICE, falling back to "alloy" for unknown
    languages.  Input is truncated to 4096 characters (tts-1 input limit).
    The temporary file is NOT auto-deleted; the caller/Gradio serves it.
    """
    out = client.audio.speech.create(
        model="tts-1",
        voice=VOICE.get(lang, "alloy"),
        input=text[:4096],  # tts-1 rejects longer inputs
    )
    # Context manager guarantees the handle is closed even if write() raises
    # (the original manual write/close leaked the handle on error).
    # delete=False: the path must outlive this function so it can be played.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
        tmp.write(out.content)
        return tmp.name
# =============== 1) Shared mic/file handling ========================
def translate_audio(audio_path, src, tgt):
    """wav/mp3 path -> (original text, translated text, TTS audio path)."""
    # Speech-to-text via Whisper, hinting the expected source language.
    with open(audio_path, "rb") as audio_file:
        transcription = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file,
            language=LANG_CODE.get(src),
        )

    original = transcription.text.strip()
    if not original:
        # Empty transcript: report recognition failure, no TTS produced.
        return "โš ๏ธ ์Œ์„ฑ ์ธ์‹ ์‹คํŒจ", "", None

    translated = _gpt_translate(original, src, tgt)
    return original, translated, _tts(translated, tgt)
# =============== 2) Live streaming translation (beta) ===============
STREAM_CHUNK_SEC = 4  # call Whisper roughly every 4 seconds of audio

# PCM format the stream is assumed to carry (16 kHz, 16-bit, mono).
_SAMPLE_RATE = 16_000
_SAMPLE_WIDTH = 2
_CHANNELS = 1


def _write_wav(pcm: bytes) -> str:
    """Write raw 16 kHz/16-bit/mono PCM into a temp .wav and return its path.

    The `wave` module emits a RIFF header whose size fields match the
    payload, so every flushed chunk is a self-consistent WAV file.
    """
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        path = tmp.name
    with wave.open(path, "wb") as wf:
        wf.setnchannels(_CHANNELS)
        wf.setsampwidth(_SAMPLE_WIDTH)
        wf.setframerate(_SAMPLE_RATE)
        wf.writeframes(pcm)
    return path


def stream_generator(mic_stream, src, tgt):
    """Generator: yields (accumulated original, accumulated translation).

    NOTE(review): assumes mic_stream.recv() returns raw byte packets whose
    very first packet starts with a 44-byte RIFF header and that the stream
    is PCM 16 kHz/16-bit/mono -- confirm against the actual audio source.
    """
    pcm_buffer = io.BytesIO()  # raw PCM only; header is stripped off
    header_stripped = False
    original_acc, translated_acc = "", ""
    flush_threshold = _SAMPLE_RATE * _SAMPLE_WIDTH * STREAM_CHUNK_SEC

    def _transcribe_buffer():
        """Flush the current PCM buffer through STT+translation."""
        o, t, _ = translate_audio(_write_wav(pcm_buffer.getvalue()), src, tgt)
        return o, t

    while True:
        chunk = mic_stream.recv()  # bytes
        if chunk is None:          # stream closed
            break
        if not header_stripped:
            # Drop the incoming 44-byte RIFF header; a fresh, correctly
            # sized header is rebuilt by _write_wav on every flush.  (The
            # previous code prepended the stale header onto data that
            # already contained it, duplicating 44 bytes of garbage audio
            # and shipping wrong RIFF size fields on later chunks.)
            chunk = chunk[44:]
            header_stripped = True
        pcm_buffer.write(chunk)

        if pcm_buffer.getbuffer().nbytes > flush_threshold:
            o, t = _transcribe_buffer()
            original_acc += " " + o
            translated_acc += " " + t
            yield original_acc.strip(), translated_acc.strip()
            pcm_buffer = io.BytesIO()  # reset for the next window

    # Flush whatever audio remains after the stream ends.
    if pcm_buffer.getbuffer().nbytes:
        o, t = _transcribe_buffer()
        yield (original_acc + " " + o).strip(), (translated_acc + " " + t).strip()
# =============== 3) Simultaneous four-language translation ==========
FOUR_LANGS = ["English", "Chinese", "Thai", "Russian"]


def translate_audio_four(audio_path, src):
    """Transcribe once, then translate into English/Chinese/Thai/Russian.

    Returns a 5-element list: [original, en, zh, th, ru].
    """
    with open(audio_path, "rb") as audio_file:
        stt = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file,
            language=LANG_CODE.get(src),
        )

    original = stt.text.strip()
    if not original:
        # Recognition failed: error marker plus four empty translations.
        return ["โš ๏ธ ์Œ์„ฑ ์ธ์‹ ์‹คํŒจ"] + [""] * 4

    return [original] + [_gpt_translate(original, src, lang) for lang in FOUR_LANGS]
# =============== Gradio UI ==========================================
def _build_translate_tab(audio_sources, audio_label):
    """Lay out one translation tab: language dropdowns, audio input,
    translate button, original/translated textboxes and TTS playback.
    Component creation order defines the on-screen layout."""
    src = gr.Dropdown(LANGUAGES, value="Korean", label="์ž…๋ ฅ")
    tgt = gr.Dropdown(LANGUAGES, value="English", label="์ถœ๋ ฅ")
    audio_in = gr.Audio(
        sources=audio_sources,
        type="filepath",
        label=audio_label,
    )
    btn = gr.Button("๋ฒˆ์—ญ")
    stt_box = gr.Textbox(label="์›๋ฌธ", lines=5)
    tlt_box = gr.Textbox(label="๋ฒˆ์—ญ", lines=5)
    tts_out = gr.Audio(label="TTS", type="filepath", autoplay=True)
    btn.click(
        translate_audio,
        inputs=[audio_in, src, tgt],
        outputs=[stt_box, tlt_box, tts_out],
    )


with gr.Blocks(title="SMARTok Demo", theme=gr.themes.Soft()) as app:
    with gr.Tabs():
        # (1) microphone + file upload
        with gr.TabItem("๐ŸŽ™๏ธ ๋งˆ์ดํฌ/ํŒŒ์ผ ๋ฒˆ์—ญ"):
            _build_translate_tab(
                ["microphone", "upload"],  # both input modes allowed
                "๐ŸŽค ๋…น์Œ ๋˜๋Š” ์˜ค๋””์˜ค ํŒŒ์ผ ์—…๋กœ๋“œ",
            )
        # (2) file upload only
        with gr.TabItem("๐ŸŽง ํŒŒ์ผ ์ „์šฉ ๋ฒˆ์—ญ"):
            _build_translate_tab(
                ["upload"],
                "์˜ค๋””์˜ค ํŒŒ์ผ ์—…๋กœ๋“œ",
            )
        # (3) live streaming translation (beta) ... unchanged from earlier revision ...
        # (4) four-language simultaneous translation ... unchanged ...
# ===================== Entry point ===================================
if __name__ == "__main__":
    # 0.0.0.0 so the app is reachable from outside a container/space.
    launch_opts = dict(server_name="0.0.0.0", server_port=7860, share=False, debug=True)
    app.launch(**launch_opts)