# voice-trans / app.py
# openfree's picture
# Update app.py
# 364ce74 verified
# raw
# history blame
# 8.24 kB
"""
SMARTok demo – fixes for the image-OCR and real-time tabs
───────────────────────────────────────────
• Images → ocrmypdf (+ghostscript) first; on failure, OCR directly with pytesseract
• Real-time 1-/4-language tabs: State argument/output counts matched to remove warnings
"""
import gradio as gr
import openai, os, io, tempfile, mimetypes
from dotenv import load_dotenv
from PIL import Image
import pdfplumber, pytesseract, ocrmypdf, subprocess, shlex
# ───── 0. Init ────────────────────────────────────────────────────
# Load variables from a .env file (if any) into the process environment.
load_dotenv()

# One OpenAI client shared by all helpers; a missing key falls back to "".
client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY", ""))

# UI language name -> two-letter code handed to the transcription API.
LC = {
    "Korean": "ko",
    "English": "en",
    "Japanese": "ja",
    "Chinese": "zh",
    "Thai": "th",
    "Russian": "ru",
    "Vietnamese": "vi",
    "Spanish": "es",
    "French": "fr",
}

# Language names offered in the dropdowns, in the same order as LC.
LANG = list(LC)

# TTS voice per language: "nova" for Korean/Japanese/Chinese, "alloy" otherwise.
VOICE = dict.fromkeys(LANG, "alloy")
VOICE.update(dict.fromkeys(("Korean", "Japanese", "Chinese"), "nova"))

# Target languages of the four-language real-time tab.
FOUR = ["English", "Chinese", "Thai", "Russian"]

CHUNK = 4  # seconds
# ───── 1. Helpers ─────────────────────────────────────────────────
def _safe(v):
    """Reduce an upload value to a plain path-like value.

    ``None`` is passed through, a dict is replaced by its ``"name"`` entry,
    and anything else is returned unchanged.
    """
    if v is None:
        return None
    if isinstance(v, dict):
        return v["name"]
    return v
def _gpt(txt, src, tgt):
    """Translate *txt* from language *src* to language *tgt*.

    Sends a single chat-completion request (model ``gpt-3.5-turbo``) whose
    system prompt asks for the translation only.

    Args:
        txt: Text to translate.
        src: Source language name, interpolated into the prompt.
        tgt: Target language name, interpolated into the prompt.

    Returns:
        The model's reply with surrounding whitespace removed, or ``""``
        when the reply carries no text content.
    """
    rsp = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "system",
                "content": f"Translate {src} โ†’ {tgt}. Return only the translation.",
            },
            {"role": "user", "content": txt},
        ],
        temperature=0.3,
        max_tokens=4096,
    )
    # The SDK types message.content as optional; treat a missing body as an
    # empty translation instead of failing on None.strip().
    return (rsp.choices[0].message.content or "").strip()
def _tts(txt, lang):
    """Synthesise speech for *txt* and return the path of the resulting MP3.

    The voice is looked up in ``VOICE`` by *lang* (``"alloy"`` when the
    language is unknown) and only the first 4096 characters of *txt* are
    sent.  The audio is written to a temporary ``.mp3`` file that is left on
    disk for the caller.
    """
    speech = client.audio.speech.create(
        model="tts-1",
        voice=VOICE.get(lang, "alloy"),
        input=txt[:4096],
    )
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as mp3:
        mp3.write(speech.content)
    return mp3.name
# ───── 2. Single Audio translate ──────────────────────────────────
def trans_audio(inp, src, tgt):
    """Transcribe an audio file, translate the transcript and voice the result.

    Args:
        inp: Audio value from the UI (path, dict with a ``"name"`` key, or None).
        src: Source language name.
        tgt: Target language name.

    Returns:
        ``(original_text, translated_text, tts_path)``.  When the file is
        missing or nothing was recognised, a warning message is returned in
        the first slot together with ``""`` and ``None``.
    """
    audio_path = _safe(inp)
    if not (audio_path and os.path.exists(audio_path)):
        return "โš ๏ธ ํŒŒ์ผ ํ•„์š”", "", None

    with open(audio_path, "rb") as audio:
        transcript = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio,
            language=LC.get(src),
        )

    original = transcript.text.strip()
    if not original:
        return "โš ๏ธ ์ธ์‹ ์‹คํŒจ", "", None

    translated = _gpt(original, src, tgt)
    speech_path = _tts(translated, tgt)
    return original, translated, speech_path
# ───── 3. Doc/Image translate ─────────────────────────────────────
# Tesseract traineddata names for each UI language.  OCR needs these codes;
# the two-letter codes in LC are meant for the transcription API and are not
# valid Tesseract language names.
_TESS_LANG = {
    "Korean": "kor",
    "English": "eng",
    "Japanese": "jpn",
    "Chinese": "chi_sim",
    "Thai": "tha",
    "Russian": "rus",
    "Vietnamese": "vie",
    "Spanish": "spa",
    "French": "fra",
}


def _ocr_image(path, tess_lang):
    """OCR a single image file and return the recognised text.

    ocrmypdf is tried first on a one-page PDF made from the image; if any
    step of that route fails, the image is passed straight to pytesseract.
    Intermediate PDFs live in a temporary directory that is removed on exit.
    """
    with Image.open(path) as img:
        # Normalise to RGB up front: some Pillow releases cannot write
        # alpha-channel images to PDF, and convert() also detaches the pixel
        # data from the file handle closed by this ``with``.
        page = img.convert("RGB")

    with tempfile.TemporaryDirectory() as workdir:
        src_pdf = os.path.join(workdir, "page.pdf")
        ocr_pdf = os.path.join(workdir, "page_ocr.pdf")
        try:
            page.save(src_pdf, "PDF")
            ocrmypdf.ocr(
                src_pdf,
                ocr_pdf,
                language=[tess_lang],
                deskew=True,
                optimize=0,
                progress_bar=False,
            )
            with pdfplumber.open(ocr_pdf) as pdf:
                return "\n".join(pg.extract_text() or "" for pg in pdf.pages)
        except Exception:
            # Deliberate best-effort fallback (e.g. Ghostscript missing or
            # ocrmypdf failing): run Tesseract on the image directly.
            return pytesseract.image_to_string(page, lang=tess_lang)


def trans_doc(file_in, src, tgt):
    """Extract text from a PDF or image and translate it.

    PDFs are read with pdfplumber (first five pages only); every other file
    is treated as an image and OCR'd in the language selected by *src*
    (English when *src* has no Tesseract mapping).

    Args:
        file_in: File value from the UI (path, dict with a ``"name"`` key,
            or None).
        src: Source language name.
        tgt: Target language name.

    Returns:
        ``(extracted_text, translated_text)``.  On a missing file, an
        extraction error, or empty extraction, a message is returned in the
        first slot and ``""`` in the second.
    """
    p = _safe(file_in)
    if not p or not os.path.exists(p):
        return "โš ๏ธ ํŒŒ์ผ ์—…๋กœ๋“œ", ""

    ext = os.path.splitext(p)[1].lower()
    mime = mimetypes.guess_type(p)[0] or ""
    try:
        if ext == ".pdf" or "pdf" in mime:  # PDF
            with pdfplumber.open(p) as pdf:
                txt = "\n".join(pg.extract_text() or "" for pg in pdf.pages[:5])
        else:  # image
            txt = _ocr_image(p, _TESS_LANG.get(src, "eng"))
    except Exception as e:
        return f"โŒ ์ถ”์ถœ ์˜ค๋ฅ˜: {e}", ""

    txt = txt.strip()
    if not txt:
        return "โš ๏ธ ํ…์ŠคํŠธ ์ถ”์ถœ ์‹คํŒจ", ""
    return txt, _gpt(txt, src, tgt)
# ───── 4. Real-time single lang ───────────────────────────────────
def stream_one(path, src, tgt, state):
    """Transcribe streamed audio and translate only the newly heard text.

    Args:
        path: Path of the audio file to transcribe; a falsy or non-existent
            path leaves the state unchanged.
        src: Source language name.
        tgt: Target language name.
        state: Dict with keys ``"o"`` (transcript so far) and ``"t"``
            (accumulated translation), or a falsy value on the first call.

    Returns:
        ``(transcript, translation, state)`` with the translation stripped
        of surrounding whitespace on every path.
    """
    state = state or {"o": "", "t": ""}
    if path and os.path.exists(path):
        with open(path, "rb") as f:
            stt = client.audio.transcriptions.create(
                model="whisper-1", file=f, language=LC.get(src))
        full = stt.text.strip()
        # NOTE(review): the slice assumes each transcript extends the
        # previous one character-for-character — confirm against how the
        # caller delivers audio.
        new = full[len(state["o"]):]
        if new:
            # Translate first and only then touch the state, so a failed
            # request does not mark text as consumed without translating it.
            translated = _gpt(new, src, tgt)
            state["o"] = full
            state["t"] += " " + translated
    return state["o"], state["t"].strip(), state
# ───── 5. Real-time 4 langs ───────────────────────────────────────
def stream_four(path, src, state):
    """Transcribe streamed audio and translate new text into four languages.

    Args:
        path: Path of the audio file to transcribe; a falsy or non-existent
            path leaves the state unchanged.
        src: Source language name.
        state: Dict with key ``"o"`` (transcript so far) plus one key per
            language in ``FOUR``, or a falsy value on the first call.

    Returns:
        ``(transcript, english, chinese, thai, russian, state)`` with every
        text stripped of surrounding whitespace on every path.
    """
    state = state or {k: "" for k in ["o"] + FOUR}
    if path and os.path.exists(path):
        with open(path, "rb") as f:
            stt = client.audio.transcriptions.create(
                model="whisper-1", file=f, language=LC.get(src))
        full = stt.text.strip()
        # NOTE(review): the slice assumes each transcript extends the
        # previous one character-for-character — confirm against how the
        # caller delivers audio.
        new = full[len(state["o"]):]
        if new:
            # Collect every translation before mutating the state: if one
            # request fails, no language gets ahead of the others and the
            # segment is not marked as consumed.
            pieces = {lang: _gpt(new, src, lang) for lang in FOUR}
            state["o"] = full
            for lang, piece in pieces.items():
                state[lang] += " " + piece
    return (
        state["o"].strip(),
        state["English"].strip(),
        state["Chinese"].strip(),
        state["Thai"].strip(),
        state["Russian"].strip(),
        state,
    )
# ───── 6. UI ──────────────────────────────────────────────────────
# Four-tab Gradio interface.  Each tab wires its widgets to one of the
# handler functions defined above; component order in ``inputs`` must match
# the handler's positional parameters.
with gr.Blocks(title="SMARTok Demo", theme=gr.themes.Soft()) as app:
    with gr.Tabs():
        # Tab 1: one-shot audio translation -> trans_audio(inp, src, tgt)
        with gr.TabItem("๐ŸŽ™๏ธ ์˜ค๋””์˜ค ๋ฒˆ์—ญ"):
            s1 = gr.Dropdown(LANG, value="Korean", label="์ž…๋ ฅ")
            t1 = gr.Dropdown(LANG, value="English", label="์ถœ๋ ฅ")
            a1 = gr.Audio(sources=["microphone", "upload"], type="filepath")
            btn1 = gr.Button("๋ฒˆ์—ญ")
            o1 = gr.Textbox(label="์›๋ฌธ", lines=5)
            tr1 = gr.Textbox(label="๋ฒˆ์—ญ", lines=5)
            aud1 = gr.Audio(label="TTS", type="filepath", autoplay=True)
            btn1.click(trans_audio, [a1, s1, t1], [o1, tr1, aud1])

        # Tab 2: document / image translation -> trans_doc(file_in, src, tgt)
        with gr.TabItem("๐Ÿ“„ ๋ฌธ์„œยท์ด๋ฏธ์ง€ ๋ฒˆ์—ญ"):
            s2 = gr.Dropdown(LANG, value="Korean", label="์ž…๋ ฅ")
            t2 = gr.Dropdown(LANG, value="English", label="์ถœ๋ ฅ")
            f2 = gr.File(file_types=[".pdf", ".png", ".jpg", ".jpeg", ".bmp", ".tiff", ".gif"])
            btn2 = gr.Button("๋ฒˆ์—ญ")
            o2 = gr.Textbox(label="์ถ”์ถœ ์›๋ฌธ", lines=15)
            tr2 = gr.Textbox(label="๋ฒˆ์—ญ ๊ฒฐ๊ณผ", lines=15)
            btn2.click(trans_doc, [f2, s2, t2], [o2, tr2])

        # Tab 3: real-time, one target language
        #        -> stream_one(path, src, tgt, state)
        with gr.TabItem("โฑ๏ธ ์‹ค์‹œ๊ฐ„ 1์–ธ์–ด"):
            s3 = gr.Dropdown(LANG, value="Korean", label="์ž…๋ ฅ")
            t3 = gr.Dropdown(LANG, value="English", label="์ถœ๋ ฅ")
            # type="filepath": the handler checks os.path.exists() on its
            # first argument, so it needs a path rather than raw samples.
            mic3 = gr.Audio(sources=["microphone"], streaming=True, type="filepath")
            # Two separate boxes: the handler returns the transcript and the
            # translation as distinct values, and sharing one component
            # would let the second overwrite the first.
            o3 = gr.Textbox(lines=8, label="์›๋ฌธ")
            tr3 = gr.Textbox(lines=8, label="๋ฒˆ์—ญ")
            st3 = gr.State()
            # The microphone must be the first input so that it lands on the
            # handler's ``path`` parameter.
            mic3.stream(stream_one, inputs=[mic3, s3, t3, st3], outputs=[o3, tr3, st3])

        # Tab 4: real-time, four target languages
        #        -> stream_four(path, src, state)
        with gr.TabItem("๐ŸŒ ์‹ค์‹œ๊ฐ„ 4์–ธ์–ด"):
            s4 = gr.Dropdown(LANG, value="Korean", label="์ž…๋ ฅ ์–ธ์–ด")
            mic4 = gr.Audio(sources=["microphone"], streaming=True, type="filepath")
            o4 = gr.Textbox(label="์›๋ฌธ", lines=8)
            e4 = gr.Textbox(label="English", lines=8)
            c4 = gr.Textbox(label="Chinese(็ฎ€ไฝ“)", lines=8)
            th4 = gr.Textbox(label="Thai", lines=8)
            r4 = gr.Textbox(label="Russian", lines=8)
            st4 = gr.State()
            mic4.stream(stream_four, inputs=[mic4, s4, st4],
                        outputs=[o4, e4, c4, th4, r4, st4])
# ───── 7. Run ─────────────────────────────────────────────────────
if __name__ == "__main__":
    # Launch only when executed as a script: bind to all interfaces on port
    # 7860, without a public share link, with debug mode enabled.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "debug": True,
    }
    app.launch(**launch_options)