Spaces:
Running
on
Zero
Running
on
Zero
""" | |
SMARTok demo - revised build fixing image OCR and real-time tab errors
──────────────────────────────────────────
• Images → try ocrmypdf (+ghostscript) first; on failure, fall back to direct OCR with pytesseract
• Real-time 1- and 4-language tabs: State input/output counts aligned to remove warnings
""" | |
import gradio as gr | |
import openai, os, io, tempfile, mimetypes | |
from dotenv import load_dotenv | |
from PIL import Image | |
import pdfplumber, pytesseract, ocrmypdf, subprocess, shlex | |
# ───── 0. Init ─────────────────────────────────────────
# Load variables from a local .env file (if any) into the environment.
load_dotenv()

# Shared OpenAI client; an unset OPENAI_API_KEY falls back to the empty string.
client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY", ""))

# UI language name -> two-letter language code.
LC = {
    "Korean": "ko",
    "English": "en",
    "Japanese": "ja",
    "Chinese": "zh",
    "Thai": "th",
    "Russian": "ru",
    "Vietnamese": "vi",
    "Spanish": "es",
    "French": "fr",
}

# Language names offered in the dropdowns, in the same order as LC.
LANG = list(LC)

# TTS voice per language: "nova" for Korean/Japanese/Chinese, "alloy" otherwise.
VOICE = {
    name: "nova" if name in ("Korean", "Japanese", "Chinese") else "alloy"
    for name in LANG
}

# Fixed target languages of the four-language real-time tab.
FOUR = ["English", "Chinese", "Thai", "Russian"]

# Chunk length in seconds (not referenced by the code visible in this file).
CHUNK = 4
# ───── 1. Helpers ──────────────────────────────────────
def _safe(v):
    """Reduce an upload value to the value the handlers use as a path.

    ``None`` stays ``None``, a dict is reduced to its ``"name"`` entry, and
    any other value is returned unchanged.
    """
    if v is None:
        return None
    if isinstance(v, dict):
        return v["name"]
    return v
def _gpt(txt, src, tgt):
    """Translate *txt* from language *src* to language *tgt*.

    Sends a single chat-completion request (model ``gpt-3.5-turbo``) whose
    system prompt names the two languages and asks for the translation only.

    Args:
        txt: Text to translate (sent as the user message).
        src: Source language name, interpolated into the system prompt.
        tgt: Target language name, interpolated into the system prompt.

    Returns:
        The model's reply with surrounding whitespace removed; an empty
        string when the reply carries no text content.
    """
    # NOTE(review): the character between {src} and {tgt} looks mis-encoded
    # (probably an arrow originally) -- confirm against the upstream file.
    rsp = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system",
             "content": f"Translate {src} โ {tgt}. Return only the translation."},
            {"role": "user", "content": txt},
        ],
        temperature=0.3,
        max_tokens=4096,
    )
    # message.content is optional in the API response; coalesce None to ""
    # so callers never hit AttributeError on .strip().
    return (rsp.choices[0].message.content or "").strip()
def _tts(txt, lang):
    """Synthesize speech for *txt* and return the path of an MP3 temp file.

    Args:
        txt: Text to speak; only the first 4096 characters are sent.
        lang: Language name used to pick a voice from ``VOICE``
            (``"alloy"`` when the language is unknown).

    Returns:
        Path of the written ``.mp3`` file, or ``None`` when *txt* is empty
        or whitespace-only (nothing to synthesize). The file is created with
        ``delete=False`` so it outlives this call; the caller owns it.
    """
    # Nothing to speak: skip the API call instead of sending empty input.
    if not txt or not txt.strip():
        return None

    out = client.audio.speech.create(
        model="tts-1",
        voice=VOICE.get(lang, "alloy"),
        input=txt[:4096],
    )
    # The context manager closes the handle even if the write fails.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as f:
        f.write(out.content)
    return f.name
# ───── 2. Single Audio translate ───────────────────────
def trans_audio(inp, src, tgt):
    """Transcribe an audio file, translate the transcript, and voice it.

    Args:
        inp: Audio input value; normalised to a path with ``_safe``.
        src: Source language name (its ``LC`` code is passed to Whisper).
        tgt: Target language name for translation and TTS voice selection.

    Returns:
        ``(original_text, translated_text, tts_path)``. When the file is
        missing or the transcript is empty, a warning string takes the first
        slot and the other two are ``""`` and ``None``.
    """
    audio_path = _safe(inp)
    if not (audio_path and os.path.exists(audio_path)):
        return "โ ๏ธ ํ์ผ ํ์", "", None

    with open(audio_path, "rb") as audio_file:
        transcript = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file,
            language=LC.get(src),
        )

    original = transcript.text.strip()
    if not original:
        return "โ ๏ธ ์ธ์ ์คํจ", "", None

    translated = _gpt(original, src, tgt)
    speech_path = _tts(translated, tgt)
    return original, translated, speech_path
# ───── 3. Doc/Image translate ──────────────────────────
# UI language name -> Tesseract language code. Both ocrmypdf and pytesseract
# drive Tesseract, which expects these codes rather than two-letter ones.
# "Chinese" maps to Simplified, matching the four-language tab's label.
_TESS_LANG = {
    "Korean": "kor",
    "English": "eng",
    "Japanese": "jpn",
    "Chinese": "chi_sim",
    "Thai": "tha",
    "Russian": "rus",
    "Vietnamese": "vie",
    "Spanish": "spa",
    "French": "fra",
}


def _ocr_image(path, tess_lang):
    """OCR one image: ocrmypdf first, direct pytesseract as the fallback."""
    try:
        # All intermediate PDFs live in a scratch directory that is removed
        # on exit, so no temp files are left behind.
        with tempfile.TemporaryDirectory() as workdir:
            plain_pdf = os.path.join(workdir, "input.pdf")
            ocr_pdf = os.path.join(workdir, "ocr.pdf")
            with Image.open(path) as img:
                img.save(plain_pdf, "PDF")
            ocrmypdf.ocr(
                plain_pdf,
                ocr_pdf,
                language=[tess_lang],
                deskew=True,
                optimize=0,
                progress_bar=False,
            )
            with pdfplumber.open(ocr_pdf) as pdf:
                return "\n".join(pg.extract_text() or "" for pg in pdf.pages)
    except Exception:
        # Deliberate best-effort: any failure on the ocrmypdf route (missing
        # ghostscript, image mode not writable as PDF, OCR error, ...) falls
        # back to OCR straight from the image.
        with Image.open(path) as img:
            return pytesseract.image_to_string(img, lang=tess_lang)


def trans_doc(file_in, src, tgt):
    """Extract text from a PDF or image and translate it.

    PDFs are read with pdfplumber (first five pages only). Any other file is
    treated as an image and OCR'd.

    Args:
        file_in: Uploaded file value; normalised to a path with ``_safe``.
        src: Source language name (selects the OCR language).
        tgt: Target language name for the translation.

    Returns:
        ``(extracted_text, translated_text)``. On a missing file, an
        extraction error, or empty extraction, a message string takes the
        first slot and the second is ``""``.
    """
    path = _safe(file_in)
    if not path or not os.path.exists(path):
        return "โ ๏ธ ํ์ผ ์ ๋ก๋", ""

    ext = os.path.splitext(path)[1].lower()
    mime = mimetypes.guess_type(path)[0] or ""
    try:
        if ext == ".pdf" or "pdf" in mime:
            with pdfplumber.open(path) as pdf:
                txt = "\n".join(pg.extract_text() or "" for pg in pdf.pages[:5])
        else:
            txt = _ocr_image(path, _TESS_LANG.get(src, "eng"))
    except Exception as e:
        return f"โ ์ถ์ถ ์ค๋ฅ: {e}", ""

    txt = txt.strip()
    if not txt:
        return "โ ๏ธ ํ ์คํธ ์ถ์ถ ์คํจ", ""
    return txt, _gpt(txt, src, tgt)
# ───── 4. Real-time single lang ────────────────────────
def stream_one(path, src, tgt, state):
    """Streaming handler: transcribe *path* and translate the new part.

    Args:
        path: Audio file path for this call; may be empty or missing.
        src: Source language name (its ``LC`` code is passed to Whisper).
        tgt: Target language name.
        state: Dict with keys ``"o"`` (original so far) and ``"t"``
            (translation so far); a falsy value starts a fresh one.

    Returns:
        ``(original, translation, state)``. The same dict object that was
        passed in is mutated and returned.
    """
    if not state:
        state = {"o": "", "t": ""}

    if path and os.path.exists(path):
        with open(path, "rb") as audio:
            stt = client.audio.transcriptions.create(
                model="whisper-1",
                file=audio,
                language=LC.get(src),
            )
        full = stt.text.strip()
        # Whatever follows the previously stored original counts as new.
        # NOTE(review): this assumes each transcript extends the previous
        # one -- confirm against how the UI delivers audio chunks.
        delta = full[len(state["o"]):]
        if delta:
            state["o"] = full
            state["t"] = state["t"] + " " + _gpt(delta, src, tgt)
        return state["o"], state["t"].strip(), state

    # No usable audio: hand back what has been accumulated so far, as stored.
    return state["o"], state["t"], state
# ───── 5. Real-time 4 langs ────────────────────────────
def stream_four(path, src, state):
    """Streaming handler: transcribe *path* and translate into four languages.

    Args:
        path: Audio file path for this call; may be empty or missing.
        src: Source language name (its ``LC`` code is passed to Whisper).
        state: Dict keyed by ``"o"`` (original so far) plus each name in
            ``FOUR`` (translation so far); a falsy value starts a fresh one.

    Returns:
        ``(original, english, chinese, thai, russian, state)``. The same
        dict object that was passed in is mutated and returned.
    """
    if not state:
        state = dict.fromkeys(["o"] + FOUR, "")

    slots = ("o", "English", "Chinese", "Thai", "Russian")

    if path and os.path.exists(path):
        with open(path, "rb") as audio:
            stt = client.audio.transcriptions.create(
                model="whisper-1",
                file=audio,
                language=LC.get(src),
            )
        full = stt.text.strip()
        # Whatever follows the previously stored original counts as new.
        # NOTE(review): this assumes each transcript extends the previous
        # one -- confirm against how the UI delivers audio chunks.
        delta = full[len(state["o"]):]
        if delta:
            state["o"] = full
            for target in FOUR:
                state[target] = state[target] + " " + _gpt(delta, src, target)
        return tuple(state[key].strip() for key in slots) + (state,)

    # No usable audio: hand back what has been accumulated so far, as stored.
    return tuple(state[key] for key in slots) + (state,)
# ───── 6. UI ───────────────────────────────────────────
# Build the four-tab Gradio UI and wire each tab to its handler.
# NOTE(review): the tab titles and labels below look mis-encoded (probably
# Korean text and emoji originally); they are kept as found -- confirm
# against the upstream file.
with gr.Blocks(title="SMARTok Demo", theme=gr.themes.Soft()) as app:
    with gr.Tabs():
        # Tab 1: one-shot audio translation (trans_audio).
        with gr.TabItem("๐๏ธ ์ค๋์ค ๋ฒ์ญ"):
            s1 = gr.Dropdown(LANG, value="Korean", label="์ ๋ ฅ")
            t1 = gr.Dropdown(LANG, value="English", label="์ถ๋ ฅ")
            a1 = gr.Audio(sources=["microphone", "upload"], type="filepath")
            btn1 = gr.Button("๋ฒ์ญ")
            o1 = gr.Textbox(label="์๋ฌธ", lines=5)
            tr1 = gr.Textbox(label="๋ฒ์ญ", lines=5)
            aud1 = gr.Audio(label="TTS", type="filepath", autoplay=True)
            btn1.click(trans_audio, [a1, s1, t1], [o1, tr1, aud1])

        # Tab 2: document / image translation (trans_doc).
        with gr.TabItem("๐ ๋ฌธ์ยท์ด๋ฏธ์ง ๋ฒ์ญ"):
            s2 = gr.Dropdown(LANG, value="Korean", label="์ ๋ ฅ")
            t2 = gr.Dropdown(LANG, value="English", label="์ถ๋ ฅ")
            f2 = gr.File(file_types=[".pdf", ".png", ".jpg", ".jpeg",
                                     ".bmp", ".tiff", ".gif"])
            btn2 = gr.Button("๋ฒ์ญ")
            o2 = gr.Textbox(label="์ถ์ถ ์๋ฌธ", lines=15)
            tr2 = gr.Textbox(label="๋ฒ์ญ ๊ฒฐ๊ณผ", lines=15)
            btn2.click(trans_doc, [f2, s2, t2], [o2, tr2])

        # Tab 3: real-time translation into one language (stream_one).
        with gr.TabItem("โฑ๏ธ ์ค์๊ฐ 1์ธ์ด"):
            s3 = gr.Dropdown(LANG, value="Korean", label="์ ๋ ฅ")
            t3 = gr.Dropdown(LANG, value="English", label="์ถ๋ ฅ")
            # stream_one opens its first argument as a file, so the mic must
            # deliver a file path rather than the default numpy data.
            mic3 = gr.Audio(sources=["microphone"], streaming=True,
                            type="filepath")
            # Two separate boxes: one shared component for both outputs
            # would let the translation overwrite the original text.
            o3 = gr.Textbox(lines=8, label="์๋ฌธ")
            tr3 = gr.Textbox(lines=8, label="๋ฒ์ญ")
            st3 = gr.State()
            # The mic itself must be the first input so the handler's
            # (path, src, tgt, state) parameters line up.
            mic3.stream(stream_one, inputs=[mic3, s3, t3, st3],
                        outputs=[o3, tr3, st3])

        # Tab 4: real-time translation into four languages (stream_four).
        with gr.TabItem("๐ ์ค์๊ฐ 4์ธ์ด"):
            s4 = gr.Dropdown(LANG, value="Korean", label="์ ๋ ฅ ์ธ์ด")
            mic4 = gr.Audio(sources=["microphone"], streaming=True,
                            type="filepath")
            o4 = gr.Textbox(label="์๋ฌธ", lines=8)
            e4 = gr.Textbox(label="English", lines=8)
            c4 = gr.Textbox(label="Chinese(็ฎไฝ)", lines=8)
            th4 = gr.Textbox(label="Thai", lines=8)
            r4 = gr.Textbox(label="Russian", lines=8)
            st4 = gr.State()
            # Mic first, matching stream_four's (path, src, state) signature.
            mic4.stream(stream_four, inputs=[mic4, s4, st4],
                        outputs=[o4, e4, c4, th4, r4, st4])
# ───── 7. Run ──────────────────────────────────────────
if __name__ == "__main__":
    # Start the app only when executed as a script: host 0.0.0.0, port 7860,
    # no public share link, debug mode enabled.
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        debug=True,
    )