# voice-trans / app.py
"""
SMARTok ํ•ต์‹ฌ ๋ฐ๋ชจ
โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
โœ“ ํƒญ 1 : ์˜ค๋””์˜ค(๋…น์ŒยทํŒŒ์ผ) ๋‹จ๊ฑด ๋ฒˆ์—ญ + TTS ์žฌ์ƒ
โœ“ ํƒญ 2 : PDF / ์ด๋ฏธ์ง€ ๋ฒˆ์—ญ (Tesseract ์—†์œผ๋ฉด PDF๋งŒ ์•ˆ๋‚ด)
โœ“ ํƒญ 3 : ์‹ค์‹œ๊ฐ„ 1๊ฐœ ์–ธ์–ด(์„ ํƒํ˜•) ๋ฒˆ์—ญ
โœ“ ํƒญ 4 : ์‹ค์‹œ๊ฐ„ 4๊ฐœ ์–ธ์–ด(์˜ยท์ค‘(๊ฐ„)ยทํƒœยท๋Ÿฌ) ๋™์‹œ ๋ฒˆ์—ญ
โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
Python โ‰ฅ3.10, Gradio 4.x, OpenAI Python SDK ํ•„์š”
"""
import gradio as gr
import openai, os, io, tempfile, mimetypes
from dotenv import load_dotenv
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ 0. ๊ณตํ†ต ์ดˆ๊ธฐํ™” โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
raise RuntimeError("OPENAI_API_KEY๋ฅผ .env ํŒŒ์ผ์— ์„ค์ •ํ•˜์„ธ์š”!")
client = openai.OpenAI(api_key=api_key)
# Languages offered in every dropdown of the UI.
LANGUAGES = [
    "Korean", "English", "Japanese", "Chinese",
    "Thai", "Russian", "Vietnamese",
    "Spanish", "French",
]

# ISO-639-1 codes handed to Whisper's `language` parameter.
LANG_CODE = {
    "Korean": "ko", "English": "en", "Japanese": "ja", "Chinese": "zh",
    "Thai": "th", "Russian": "ru", "Vietnamese": "vi",
    "Spanish": "es", "French": "fr",
}

# TTS voice per language: "nova" for Korean/Japanese/Chinese, "alloy" otherwise.
_NOVA_LANGS = {"Korean", "Japanese", "Chinese"}
VOICE = {}
for _lang in LANGUAGES:
    VOICE[_lang] = "nova" if _lang in _NOVA_LANGS else "alloy"

FOUR_LANGS = ["English", "Chinese", "Thai", "Russian"]  # targets for simultaneous live translation
STREAM_SEC = 4  # seconds of audio per Whisper call (≈3-4 s latency)
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ 1. ์œ ํ‹ธ ํ•จ์ˆ˜ โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
def _safe_path(v):
"""Gradio File/Audio ์ž…๋ ฅ โ†’ ์‹ค์ œ ๊ฒฝ๋กœ ์ถ”์ถœ"""
if v is None:
return None
return v.get("name") if isinstance(v, dict) else v
def _gpt_translate(text: str, src: str, tgt: str) -> str:
    """Translate *text* from *src* to *tgt* with GPT-3.5-turbo.

    The system prompt instructs the model to return the translation only,
    with no explanation; the stripped result is returned.
    """
    system_prompt = (
        f"You are a professional translator. Translate the following {src} text to {tgt}. "
        f"Only provide the translated text."
    )
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": text},
        ],
        temperature=0.3,
        max_tokens=4096,
    )
    return response.choices[0].message.content.strip()
def _tts(text: str, lang: str) -> str:
    """Synthesise *text* with OpenAI TTS-1 and return the MP3 file path.

    The voice is chosen from VOICE (falling back to "alloy") and the input
    is truncated to 4096 characters before synthesis.
    """
    speech = client.audio.speech.create(
        model="tts-1",
        voice=VOICE.get(lang, "alloy"),
        input=text[:4096],
    )
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
        tmp.write(speech.content)
    return tmp.name
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ 2. ๋‹จ๊ฑด ์˜ค๋””์˜ค ๋ฒˆ์—ญ โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
def translate_audio(audio_in, src, tgt):
path = _safe_path(audio_in)
if not path or not os.path.exists(path):
return "โš ๏ธ ์Œ์„ฑ ํŒŒ์ผ์„ ๋…น์Œ-์—…๋กœ๋“œํ•˜์„ธ์š”.", "", None
with open(path,"rb") as f:
stt = client.audio.transcriptions.create(
model="whisper-1",
file=f,
language=LANG_CODE.get(src)
)
original = stt.text.strip()
if not original:
return "โš ๏ธ ์Œ์„ฑ ์ธ์‹ ์‹คํŒจ", "", None
translated = _gpt_translate(original, src, tgt)
tts_path = _tts(translated, tgt)
return original, translated, tts_path
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ 3. PDF / ์ด๋ฏธ์ง€ ๋ฒˆ์—ญ โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
def translate_document(file_in, src, tgt):
path = _safe_path(file_in)
if not path or not os.path.exists(path):
return "โš ๏ธ PDF(๋˜๋Š” ์ด๋ฏธ์ง€) ํŒŒ์ผ์„ ์—…๋กœ๋“œํ•˜์„ธ์š”.", ""
ext = os.path.splitext(path)[1].lower()
mime = mimetypes.guess_type(path)[0] or ""
text = ""
try:
if ext == ".pdf" or "pdf" in mime:
import pdfplumber
with pdfplumber.open(path) as pdf:
pages = pdf.pages[:5] # ๋ฐ๋ชจ: 5์ชฝ ์ œํ•œ
text = "\n".join(p.extract_text() or "" for p in pages)
else:
# ์ด๋ฏธ์ง€์˜ ๊ฒฝ์šฐ Tesseract ํ•„์š”
try:
from PIL import Image
import pytesseract
text = pytesseract.image_to_string(Image.open(path))
except Exception:
return "โš ๏ธ ์„œ๋ฒ„์— Tesseract OCR๊ฐ€ ์—†์–ด์„œ ์ด๋ฏธ์ง€ OCR์€ ์ง€์›๋˜์ง€ ์•Š์Šต๋‹ˆ๋‹ค. PDF๋งŒ ์‚ฌ์šฉํ•˜์„ธ์š”.", ""
except Exception as e:
return f"โŒ ํ…์ŠคํŠธ ์ถ”์ถœ ์‹คํŒจ: {e}", ""
text = text.strip()
if not text:
return "โš ๏ธ ํ…์ŠคํŠธ๊ฐ€ ์ถ”์ถœ๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค.", ""
translated = _gpt_translate(text, src, tgt)
return text, translated
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ 4. ์‹ค์‹œ๊ฐ„ 1๊ฐœ ์–ธ์–ด ๋ฒˆ์—ญ โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
def stream_single(mic_stream, src, tgt):
buf, header = io.BytesIO(), None
o_acc, t_acc = "", ""
while True:
chunk = mic_stream.recv()
if chunk is None:
break
if header is None:
header = chunk[:44]
buf.write(chunk)
if buf.getbuffer().nbytes > 16000*2*STREAM_SEC:
wav = header + buf.getvalue()
with tempfile.NamedTemporaryFile(delete=False,suffix=".wav") as tmp:
tmp.write(wav); tmp.close()
o, t, _ = translate_audio(tmp.name, src, tgt)
o_acc += " " + o
t_acc += " " + t
yield o_acc.strip(), t_acc.strip()
buf = io.BytesIO()
if buf.getbuffer().nbytes:
wav = header + buf.getvalue()
with tempfile.NamedTemporaryFile(delete=False,suffix=".wav") as tmp:
tmp.write(wav); tmp.close()
o, t, _ = translate_audio(tmp.name, src, tgt)
yield (o_acc+" "+o).strip(), (t_acc+" "+t).strip()
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ 5. ์‹ค์‹œ๊ฐ„ 4๊ฐœ ์–ธ์–ด ๋ฒˆ์—ญ โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
def stream_multi(mic_stream, src):
buf, header = io.BytesIO(), None
acc = {lang:"" for lang in ["orig"]+FOUR_LANGS}
while True:
chunk = mic_stream.recv()
if chunk is None:
break
if header is None:
header = chunk[:44]
buf.write(chunk)
if buf.getbuffer().nbytes > 16000*2*STREAM_SEC:
wav = header + buf.getvalue()
with tempfile.NamedTemporaryFile(delete=False,suffix=".wav") as tmp:
tmp.write(wav); tmp.close()
with open(tmp.name,"rb") as f:
stt = client.audio.transcriptions.create(
model="whisper-1", file=f,
language=LANG_CODE.get(src)
)
orig = stt.text.strip()
if orig:
acc["orig"] += " " + orig
for lang in FOUR_LANGS:
acc[lang] += " " + _gpt_translate(orig, src, lang)
yield (acc["orig"].strip(),
acc["English"].strip(),
acc["Chinese"].strip(),
acc["Thai"].strip(),
acc["Russian"].strip())
buf = io.BytesIO()
if buf.getbuffer().nbytes:
wav = header + buf.getvalue()
with tempfile.NamedTemporaryFile(delete=False,suffix=".wav") as tmp:
tmp.write(wav); tmp.close()
with open(tmp.name,"rb") as f:
stt = client.audio.transcriptions.create(
model="whisper-1", file=f,
language=LANG_CODE.get(src)
)
orig = stt.text.strip()
if orig:
acc["orig"] += " " + orig
for lang in FOUR_LANGS:
acc[lang] += " " + _gpt_translate(orig, src, lang)
yield (acc["orig"].strip(),
acc["English"].strip(),
acc["Chinese"].strip(),
acc["Thai"].strip(),
acc["Russian"].strip())
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ 6. Gradio UI โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
with gr.Blocks(title="SMARTok Demo", theme=gr.themes.Soft()) as app:
with gr.Tabs():
# ํƒญ 1 โ”€ ์˜ค๋””์˜ค ๋ฒˆ์—ญ
with gr.TabItem("๐ŸŽ™๏ธ ์˜ค๋””์˜ค ๋ฒˆ์—ญ"):
src1 = gr.Dropdown(LANGUAGES,value="Korean",label="์ž…๋ ฅ")
tgt1 = gr.Dropdown(LANGUAGES,value="English",label="์ถœ๋ ฅ")
aud1 = gr.Audio(sources=["microphone","upload"],
type="filepath",
label="๋…น์Œ ๋˜๋Š” ์˜ค๋””์˜ค ํŒŒ์ผ ์—…๋กœ๋“œ")
btn1 = gr.Button("๋ฒˆ์—ญ")
stt1 = gr.Textbox(label="์›๋ฌธ", lines=5)
tlt1 = gr.Textbox(label="๋ฒˆ์—ญ", lines=5)
out1 = gr.Audio(label="TTS",type="filepath",autoplay=True)
btn1.click(translate_audio,[aud1,src1,tgt1],[stt1,tlt1,out1])
# ํƒญ 2 โ”€ ๋ฌธ์„œ/์ด๋ฏธ์ง€ ๋ฒˆ์—ญ
with gr.TabItem("๐Ÿ“„ ๋ฌธ์„œ/์ด๋ฏธ์ง€ ๋ฒˆ์—ญ"):
src2 = gr.Dropdown(LANGUAGES,value="Korean",label="์ž…๋ ฅ")
tgt2 = gr.Dropdown(LANGUAGES,value="English",label="์ถœ๋ ฅ")
file2= gr.File(label="PDF ๋˜๋Š” ์ด๋ฏธ์ง€ ์—…๋กœ๋“œ",
file_types=[".pdf",".png",".jpg",".jpeg",".bmp",".tiff",".gif"])
btn2 = gr.Button("๋ฒˆ์—ญ")
org2 = gr.Textbox(label="์ถ”์ถœ ์›๋ฌธ",lines=15)
trs2 = gr.Textbox(label="๋ฒˆ์—ญ ๊ฒฐ๊ณผ",lines=15)
btn2.click(translate_document,[file2,src2,tgt2],[org2,trs2])
# ํƒญ 3 โ”€ ์‹ค์‹œ๊ฐ„ 1์–ธ์–ด ๋ฒˆ์—ญ
with gr.TabItem("โฑ๏ธ ์‹ค์‹œ๊ฐ„ 1์–ธ์–ด"):
src3 = gr.Dropdown(LANGUAGES,value="Korean",label="์ž…๋ ฅ")
tgt3 = gr.Dropdown(LANGUAGES,value="English",label="์ถœ๋ ฅ")
mic3 = gr.Audio(sources=["microphone"],
streaming=True,
label="์‹ค์‹œ๊ฐ„ ๋งˆ์ดํฌ")
stt3 = gr.Textbox(label="์›๋ฌธ(์‹ค์‹œ๊ฐ„)",lines=8)
tlt3 = gr.Textbox(label="๋ฒˆ์—ญ(์‹ค์‹œ๊ฐ„)",lines=8)
mic3.stream(stream_single,inputs=[src3,tgt3],outputs=[stt3,tlt3])
# ํƒญ 4 โ”€ ์‹ค์‹œ๊ฐ„ 4๊ฐœ ์–ธ์–ด ๋ฒˆ์—ญ
with gr.TabItem("๐ŸŒ ์‹ค์‹œ๊ฐ„ 4๊ฐœ ์–ธ์–ด"):
gr.Markdown("๋งˆ์ดํฌ ์ž…๋ ฅ์„ 3-4 ์ดˆ ๊ฐ„๊ฒฉ์œผ๋กœ **English / Chinese(็ฎ€ไฝ“) / Thai / Russian** 4๊ฐœ ์–ธ์–ด๋กœ ๋™์‹œ ๋ฒˆ์—ญํ•ฉ๋‹ˆ๋‹ค.")
src4 = gr.Dropdown(LANGUAGES,value="Korean",label="์ž…๋ ฅ ์–ธ์–ด")
mic4 = gr.Audio(sources=["microphone"],
streaming=True,
label="์‹ค์‹œ๊ฐ„ ๋งˆ์ดํฌ")
o4 = gr.Textbox(label="์›๋ฌธ",lines=8)
e4 = gr.Textbox(label="English",lines=8)
z4 = gr.Textbox(label="Chinese(็ฎ€ไฝ“)",lines=8)
t4 = gr.Textbox(label="Thai",lines=8)
r4 = gr.Textbox(label="Russian",lines=8)
mic4.stream(stream_multi,inputs=[src4],
outputs=[o4,e4,z4,t4,r4])
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ 7. ์‹คํ–‰ โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
if __name__ == "__main__":
app.launch(server_name="0.0.0.0",
server_port=7860,
share=False,
debug=True)