voice-trans / app.py
openfree's picture
Update app.py
6b6f26e verified
raw
history blame
9.52 kB
"""
SMARTok real-time multilingual demo (fully revised version)
────────────────────────────────────────────
• Tab 1 🎙️ Audio translation : microphone/file → translation + TTS
• Tab 2 📄 Document/image translation : PDF / image (OCR) → translation
• Tab 3 ⏱️ Real-time single-language translation : microphone → live subtitles in one language
• Tab 4 🌏 Real-time four-language translation : microphone → simultaneous English/Chinese/Thai/Russian subtitles
────────────────────────────────────────────
Required apt : tesseract-ocr libtesseract-dev ocrmypdf ffmpeg
Required pip : gradio>=5.33 openai python-dotenv pdfplumber ocrmypdf pillow
"""
import gradio as gr
import openai, os, io, tempfile, mimetypes, json, uuid
from dotenv import load_dotenv
import pdfplumber, ocrmypdf
from PIL import Image
# ────────────── 0. Shared initialization ────────────────────────────────────
# Pull settings from a .env file (if any) into the environment, then refuse to
# start without an API key: every feature below goes through `client`.
load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError("OPENAI_API_KEY๋ฅผ .env ํŒŒ์ผ์— ์„ค์ •ํ•˜์„ธ์š”!")
client = openai.OpenAI(api_key=api_key)

# Display name -> two-letter code; passed as `language` to the transcription
# calls in this file.
LANG_CODE = {
    "Korean": "ko",
    "English": "en",
    "Japanese": "ja",
    "Chinese": "zh",
    "Thai": "th",
    "Russian": "ru",
    "Vietnamese": "vi",
    "Spanish": "es",
    "French": "fr",
}

# Dropdown choices, in the same order as the LANG_CODE keys.
LANGUAGES = list(LANG_CODE)

# TTS voice per language: "nova" for Korean/Japanese/Chinese, "alloy" otherwise.
VOICE = {
    name: "nova" if name in ("Korean", "Japanese", "Chinese") else "alloy"
    for name in LANGUAGES
}

# Target languages of the four-language real-time tab.
FOUR = ["English", "Chinese", "Thai", "Russian"]

# Whisper call interval (presumably in seconds, going by the name).
STREAM_SEC = 4
# ────────────── 1. Utility functions ─────────────────────────────────────
def _safe(v):
if v is None:
return None
return v["name"] if isinstance(v, dict) else v
def _gpt(text, src, tgt):
rsp = client.chat.completions.create(
model="gpt-3.5-turbo",
messages=[
{"role":"system",
"content":f"Translate the following {src} text to {tgt}. "
"Return only the translation."},
{"role":"user","content":text}
],
temperature=0.3,max_tokens=4096
)
return rsp.choices[0].message.content.strip()
def _tts(text, lang):
rsp = client.audio.speech.create(
model="tts-1",
voice=VOICE.get(lang,"alloy"),
input=text[:4096]
)
tmp = tempfile.NamedTemporaryFile(delete=False,suffix=".mp3")
tmp.write(rsp.content); tmp.close()
return tmp.name
# ────────────── 2. Audio (single-shot) translation ─────────────────────────────
def translate_audio(audio_in, src, tgt):
    """Transcribe an audio file, translate the transcript, and voice the result.

    Args:
        audio_in: Upload value from the UI; resolved to a path with ``_safe``.
        src: Source language name (key of ``LANG_CODE``).
        tgt: Target language name.

    Returns:
        A 3-tuple ``(original_text, translated_text, tts_path)``. When no
        readable file is supplied, or the transcript is empty, the first item
        is a warning message, the second is ``""`` and the third is ``None``.
    """
    audio_path = _safe(audio_in)
    if not (audio_path and os.path.exists(audio_path)):
        return "โš ๏ธ ์Œ์„ฑ ํŒŒ์ผ ํ•„์š”", "", None

    with open(audio_path, "rb") as audio_file:
        transcription = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file,
            language=LANG_CODE.get(src),
        )

    original_text = transcription.text.strip()
    if original_text == "":
        return "โš ๏ธ ์Œ์„ฑ ์ธ์‹ ์‹คํŒจ", "", None

    translated_text = _gpt(original_text, src, tgt)
    speech_path = _tts(translated_text, tgt)
    return original_text, translated_text, speech_path
# ────────────── 3. Document / image translation ────────────────────────────
# Tesseract identifies language packs by its own codes, not by the two-letter
# codes in LANG_CODE, so OCR needs a separate table.
_TESSERACT_LANG = {
    "Korean": "kor",
    "English": "eng",
    "Japanese": "jpn",
    "Chinese": "chi_sim",
    "Thai": "tha",
    "Russian": "rus",
    "Vietnamese": "vie",
    "Spanish": "spa",
    "French": "fra",
}


def _pdf_text(pdf_path, max_pages=None):
    """Return the text of a PDF's pages (optionally only the first ``max_pages``), newline-joined."""
    with pdfplumber.open(pdf_path) as pdf:
        pages = pdf.pages if max_pages is None else pdf.pages[:max_pages]
        return "\n".join(page.extract_text() or "" for page in pages)


def _ocr_image_text(image_path, src):
    """OCR an image by wrapping it in a PDF, running ocrmypdf, and reading the text layer."""
    # Both intermediate PDFs live in a scratch directory that is removed on
    # exit, so nothing is left behind per upload.
    with tempfile.TemporaryDirectory() as workdir:
        img_pdf = os.path.join(workdir, "image.pdf")
        ocr_pdf = os.path.join(workdir, "ocr.pdf")
        with Image.open(image_path) as img:
            # Pillow's PDF writer rejects images with an alpha channel;
            # converting to RGB makes every accepted upload type savable.
            img.convert("RGB").save(img_pdf, "PDF")
        ocrmypdf.ocr(
            img_pdf,
            ocr_pdf,
            language=[_TESSERACT_LANG.get(src, "eng")],
            deskew=True,
            optimize=0,
            progress_bar=False,
        )
        return _pdf_text(ocr_pdf)


def translate_doc(file_in, src, tgt):
    """Extract text from a PDF or an image and translate it.

    PDFs are read directly (first five pages only). Any other file is treated
    as an image and goes through OCR first.

    Args:
        file_in: Upload value from the UI; resolved to a path with ``_safe``.
        src: Source language name; selects the OCR language and is passed on
            to the translator. Unknown names fall back to English OCR.
        tgt: Target language name.

    Returns:
        A 2-tuple ``(extracted_text, translation)``. When no file is supplied,
        extraction raises, or no text is found, the first item is a message
        and the second is ``""``.
    """
    p = _safe(file_in)
    if not p or not os.path.exists(p):
        return "โš ๏ธ PDF/์ด๋ฏธ์ง€ ์—…๋กœ๋“œ", ""

    ext = os.path.splitext(p)[1].lower()
    mime = mimetypes.guess_type(p)[0] or ""
    try:
        if ext == ".pdf" or "pdf" in mime:
            txt = _pdf_text(p, max_pages=5)
        else:
            txt = _ocr_image_text(p, src)
    except Exception as e:
        # UI boundary: report the failure in the output box rather than
        # letting the handler raise.
        return f"โŒ ์ถ”์ถœ ์˜ค๋ฅ˜: {e}", ""

    txt = txt.strip()
    if not txt:
        return "โš ๏ธ ํ…์ŠคํŠธ ์ถ”์ถœ ์‹คํŒจ", ""
    return txt, _gpt(txt, src, tgt)
# ────────────── 4. Real-time, single language ──────────────────────────────────
def stream_single(audio_path, src, tgt, state):
    """Transcribe the latest audio and extend a running single-language translation.

    Args:
        audio_path: Path of the audio file to transcribe. A falsy or
            non-existent path leaves the state untouched.
        src: Source language name (key of ``LANG_CODE``).
        tgt: Target language name.
        state: Dict with ``"orig"`` and ``"trans"`` strings, or a falsy value
            on the first call (a fresh dict is created). Mutated in place.

    Returns:
        ``(original_text, translated_text, state)`` with both texts stripped.
    """
    state = state or {"orig": "", "trans": ""}
    if not audio_path or not os.path.exists(audio_path):
        # Strip here as well so this path returns the same form of text as the
        # normal path ("trans" is stored with a leading separator space).
        return state["orig"].strip(), state["trans"].strip(), state

    with open(audio_path, "rb") as f:
        stt = client.audio.transcriptions.create(
            model="whisper-1", file=f, language=LANG_CODE.get(src))
    full = stt.text.strip()

    prev = state["orig"]
    if full.startswith(prev):
        # The transcript extends what we already have: translate only the tail.
        new = full[len(prev):].strip()
        state["orig"] = full
    else:
        # The transcript is not a continuation of the stored text, so cutting
        # it at len(prev) would drop or garble words. Treat it as a new
        # segment and append it whole.
        # NOTE(review): assumes each call then carries only fresh audio; if
        # the audio is cumulative and the recognizer revised earlier words,
        # this repeats text -- confirm against the streaming source.
        new = full
        state["orig"] = f"{prev} {full}".strip()

    if new:
        state["trans"] += " " + _gpt(new, src, tgt)
    return state["orig"].strip(), state["trans"].strip(), state
# ────────────── 5. Real-time, four languages ──────────────────────────────────
def stream_multi(audio_path, src, state):
    """Transcribe the latest audio and extend running translations in four languages.

    Args:
        audio_path: Path of the audio file to transcribe. A falsy or
            non-existent path leaves the state untouched.
        src: Source language name (key of ``LANG_CODE``).
        state: Dict with an ``"orig"`` string plus one string per language in
            ``FOUR``, or a falsy value on the first call (a fresh dict is
            created). Mutated in place.

    Returns:
        ``(orig, english, chinese, thai, russian, state)`` with the five texts
        stripped.
    """
    if not state:
        state = {k: "" for k in ["orig"] + FOUR}

    def _snapshot():
        # One place builds the return value, so the early-exit path and the
        # normal path return text in the same (stripped) form.
        return (state["orig"].strip(),
                state["English"].strip(),
                state["Chinese"].strip(),
                state["Thai"].strip(),
                state["Russian"].strip(),
                state)

    if not audio_path or not os.path.exists(audio_path):
        return _snapshot()

    with open(audio_path, "rb") as f:
        stt = client.audio.transcriptions.create(
            model="whisper-1", file=f, language=LANG_CODE.get(src))
    full = stt.text.strip()

    prev = state["orig"]
    if full.startswith(prev):
        # The transcript extends what we already have: translate only the tail.
        new = full[len(prev):].strip()
        state["orig"] = full
    else:
        # The transcript is not a continuation of the stored text, so cutting
        # it at len(prev) would drop or garble words. Treat it as a new
        # segment and append it whole.
        # NOTE(review): assumes each call then carries only fresh audio; if
        # the audio is cumulative and the recognizer revised earlier words,
        # this repeats text -- confirm against the streaming source.
        new = full
        state["orig"] = f"{prev} {full}".strip()

    if new:
        for lang in FOUR:
            state[lang] += " " + _gpt(new, src, lang)
    return _snapshot()
# ────────────── 6. Gradio UI ───────────────────────────────────
# Four-tab interface; each tab wires its inputs to one of the handlers above.
with gr.Blocks(title="SMARTok Demo", theme=gr.themes.Soft()) as app:
    with gr.Tabs():
        # Tab 1: one-shot audio translation with spoken output.
        with gr.TabItem("๐ŸŽ™๏ธ ์˜ค๋””์˜ค ๋ฒˆ์—ญ"):
            src1 = gr.Dropdown(LANGUAGES, value="Korean", label="์ž…๋ ฅ ์–ธ์–ด")
            tgt1 = gr.Dropdown(LANGUAGES, value="English", label="์ถœ๋ ฅ ์–ธ์–ด")
            aud1 = gr.Audio(sources=["microphone", "upload"], type="filepath")
            res1 = gr.Button("๋ฒˆ์—ญ")
            o1 = gr.Textbox(label="์›๋ฌธ", lines=5)
            t1 = gr.Textbox(label="๋ฒˆ์—ญ", lines=5)
            a1 = gr.Audio(label="TTS", type="filepath", autoplay=True)
            res1.click(translate_audio, [aud1, src1, tgt1], [o1, t1, a1])

        # Tab 2: PDF / image translation.
        with gr.TabItem("๐Ÿ“„ ๋ฌธ์„œยท์ด๋ฏธ์ง€ ๋ฒˆ์—ญ"):
            src2 = gr.Dropdown(LANGUAGES, value="Korean", label="์ž…๋ ฅ ์–ธ์–ด")
            tgt2 = gr.Dropdown(LANGUAGES, value="English", label="์ถœ๋ ฅ ์–ธ์–ด")
            file2 = gr.File(label="PDF/์ด๋ฏธ์ง€ ์—…๋กœ๋“œ",
                            file_types=[".pdf", ".png", ".jpg", ".jpeg",
                                        ".bmp", ".tiff", ".gif"])
            doc2 = gr.Button("๋ฒˆ์—ญ")
            o2 = gr.Textbox(label="์ถ”์ถœ ์›๋ฌธ", lines=15)
            t2 = gr.Textbox(label="๋ฒˆ์—ญ ๊ฒฐ๊ณผ", lines=15)
            doc2.click(translate_doc, [file2, src2, tgt2], [o2, t2])

        # Tab 3: real-time translation into one language.
        with gr.TabItem("โฑ๏ธ ์‹ค์‹œ๊ฐ„ 1์–ธ์–ด"):
            src3 = gr.Dropdown(LANGUAGES, value="Korean", label="์ž…๋ ฅ ์–ธ์–ด")
            tgt3 = gr.Dropdown(LANGUAGES, value="English", label="์ถœ๋ ฅ ์–ธ์–ด")
            # type="filepath": stream_single opens its first argument as a file.
            mic3 = gr.Audio(sources=["microphone"], streaming=True,
                            type="filepath")
            o3 = gr.Textbox(label="์›๋ฌธ(์‹ค์‹œ๊ฐ„)", lines=8)
            t3 = gr.Textbox(label="๋ฒˆ์—ญ(์‹ค์‹œ๊ฐ„)", lines=8)
            st3 = gr.State()
            # The microphone must be the first input: stream_single takes
            # (audio_path, src, tgt, state). STREAM_SEC sets how often a chunk
            # is sent, i.e. how often Whisper is called.
            mic3.stream(stream_single,
                        inputs=[mic3, src3, tgt3, st3],
                        outputs=[o3, t3, st3],
                        stream_every=STREAM_SEC)

        # Tab 4: real-time translation into four languages at once.
        with gr.TabItem("๐ŸŒ ์‹ค์‹œ๊ฐ„ 4๊ฐœ ์–ธ์–ด"):
            src4 = gr.Dropdown(LANGUAGES, value="Korean", label="์ž…๋ ฅ ์–ธ์–ด")
            # type="filepath": stream_multi opens its first argument as a file.
            mic4 = gr.Audio(sources=["microphone"], streaming=True,
                            type="filepath")
            o4 = gr.Textbox(label="์›๋ฌธ", lines=8)
            e4 = gr.Textbox(label="English", lines=8)
            c4 = gr.Textbox(label="Chinese(็ฎ€ไฝ“)", lines=8)
            th4 = gr.Textbox(label="Thai", lines=8)
            r4 = gr.Textbox(label="Russian", lines=8)
            st4 = gr.State()
            # The microphone must be the first input: stream_multi takes
            # (audio_path, src, state).
            mic4.stream(stream_multi,
                        inputs=[mic4, src4, st4],
                        outputs=[o4, e4, c4, th4, r4, st4],
                        stream_every=STREAM_SEC)
# ────────────── 7. Run ───────────────────────────────────────
# Start the server only when run as a script: listen on all interfaces at
# port 7860, no public share link, debug mode on.
if __name__ == "__main__":
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        debug=True,
    )