File size: 8,238 Bytes
effad1c
364ce74
 
 
 
effad1c
 
b3067c5
364ce74
6bdc489
6b6f26e
364ce74
b3067c5
364ce74
6bdc489
364ce74
6bdc489
364ce74
 
 
 
 
6b6f26e
364ce74
fd022eb
364ce74
 
392a5eb
364ce74
a609646
 
364ce74
 
 
 
a609646
 
364ce74
 
 
 
 
a609646
364ce74
 
 
 
6b6f26e
364ce74
 
 
 
 
 
6b6f26e
364ce74
 
 
 
 
 
fd022eb
364ce74
6b6f26e
364ce74
 
 
 
 
 
 
 
 
 
 
 
 
fd022eb
364ce74
 
 
 
6b6f26e
364ce74
 
 
 
 
 
 
 
6b6f26e
364ce74
 
 
6b6f26e
364ce74
 
 
 
 
 
 
 
 
6b6f26e
364ce74
 
 
 
 
6b6f26e
364ce74
 
7cce69a
364ce74
392a5eb
364ce74
 
 
 
 
 
 
6b6f26e
364ce74
6b6f26e
364ce74
 
 
 
 
 
6b6f26e
364ce74
392a5eb
364ce74
 
 
 
 
6b6f26e
364ce74
 
 
 
 
 
 
 
6b6f26e
 
364ce74
 
6b6f26e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
"""
SMARTok demo – fixed build for the image-OCR and real-time tab errors
───────────────────────────────────────────
• Images → try ocrmypdf (+ghostscript) first; on failure, OCR directly with pytesseract
• Real-time 1-language / 4-language tabs: State argument/output counts aligned to remove the warnings
"""

import gradio as gr
import openai, os, io, tempfile, mimetypes
from dotenv import load_dotenv
from PIL import Image
import pdfplumber, pytesseract, ocrmypdf, subprocess, shlex

# ───── 0. Init ────────────────────────────────────────────────────
# Load variables from a .env file (if present) before the API key is read.
load_dotenv()
# An empty string is passed as the key when OPENAI_API_KEY is not set.
client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY", ""))

# UI language name -> two-letter code passed as ``language`` to transcription.
LC = {
    "Korean": "ko",
    "English": "en",
    "Japanese": "ja",
    "Chinese": "zh",
    "Thai": "th",
    "Russian": "ru",
    "Vietnamese": "vi",
    "Spanish": "es",
    "French": "fr",
}
# Language names offered in the dropdowns, in display order.
LANG = list(LC)
# TTS voice per language: "nova" for Korean/Japanese/Chinese, "alloy" otherwise.
VOICE = {name: "alloy" for name in LANG}
VOICE.update(dict.fromkeys(("Korean", "Japanese", "Chinese"), "nova"))
# Target languages of the four-language real-time tab.
FOUR = ["English", "Chinese", "Thai", "Russian"]
CHUNK = 4  # seconds

# ───── 1. Helpers ─────────────────────────────────────────────────
def _safe(v):
    """Normalise an uploaded-file value to something usable as a path.

    Args:
        v: ``None``, a path (``str`` / ``bytes`` / ``os.PathLike``), a dict
            carrying the path under ``"name"``, or a file-like object that
            exposes its location as ``.name``.

    Returns:
        ``None`` for ``None``; ``v["name"]`` for a dict; ``v.name`` for a
        file-like object; otherwise ``v`` unchanged.

    Raises:
        KeyError: If ``v`` is a dict without a ``"name"`` key.
    """
    if v is None:
        return None
    if isinstance(v, dict):
        return v["name"]
    # Real paths pass through untouched. This check must come before the
    # ``.name`` lookup because ``pathlib.Path`` also has a ``.name`` attribute
    # (the basename only), which would silently drop the directory part.
    if isinstance(v, (str, bytes, os.PathLike)):
        return v
    # Temp-file style wrappers keep their on-disk location in ``.name``;
    # handing the wrapper itself to ``os.path.exists`` would raise TypeError.
    return getattr(v, "name", v)

def _gpt(txt, src, tgt):
    """Translate ``txt`` from ``src`` to ``tgt`` with one chat-completion call.

    Args:
        txt: Text to translate; sent as the user message.
        src: Source language name, interpolated into the system prompt.
        tgt: Target language name, interpolated into the system prompt.

    Returns:
        The first choice's message text with surrounding whitespace removed,
        or an empty string when the reply carries no text content.
    """
    rsp = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system",
             "content": f"Translate {src} โ†’ {tgt}. Return only the translation."},
            {"role": "user", "content": txt},
        ],
        temperature=0.3,
        max_tokens=4096,
    )
    # ``message.content`` is nullable in the API schema; guard so a reply
    # without text yields "" instead of an AttributeError on ``None.strip()``.
    content = rsp.choices[0].message.content
    return (content or "").strip()

def _tts(txt, lang):
    """Synthesise speech for ``txt`` and store it in a temporary MP3 file.

    Args:
        txt: Text to speak; only the first 4096 characters are sent.
        lang: Language name used to look up the voice in ``VOICE``
            ("alloy" when the name is unknown).

    Returns:
        Path of the written ``.mp3`` file. The file is not deleted here; the
        caller owns it.
    """
    out = client.audio.speech.create(
        model="tts-1",
        voice=VOICE.get(lang, "alloy"),
        input=txt[:4096],
    )
    # delete=False because the path is returned and must outlive this call.
    # The ``with`` block closes the handle even if the write fails.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as f:
        f.write(out.content)
    return f.name

# ───── 2. Single Audio translate ──────────────────────────────────
def trans_audio(inp, src, tgt):
    """Transcribe an audio file, translate the transcript and voice the result.

    Args:
        inp: Uploaded/recorded audio value; resolved to a path via ``_safe``.
        src: Source language name (also selects the transcription language).
        tgt: Target language name for translation and TTS voice selection.

    Returns:
        ``(original_text, translated_text, tts_path)``. When the file is
        missing or nothing is recognised, the first item is a warning
        message, the second is ``""`` and the third is ``None``.
    """
    audio_path = _safe(inp)
    if not (audio_path and os.path.exists(audio_path)):
        return "โš ๏ธ ํŒŒ์ผ ํ•„์š”", "", None

    with open(audio_path, "rb") as audio_file:
        transcript = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file,
            language=LC.get(src),
        )

    original = transcript.text.strip()
    if not original:
        return "โš ๏ธ ์ธ์‹ ์‹คํŒจ", "", None

    translated = _gpt(original, src, tgt)
    speech_path = _tts(translated, tgt)
    return original, translated, speech_path

# ───── 3. Doc/Image translate ─────────────────────────────────────
# UI language name -> Tesseract traineddata name. OCR cannot use the
# two-letter codes in ``LC`` ("ko", "ja", ...); Tesseract expects names such
# as "kor" or "jpn" and fails to load a language called "ko".
_TESS_LANG = {
    "Korean": "kor",
    "English": "eng",
    "Japanese": "jpn",
    "Chinese": "chi_sim",
    "Thai": "tha",
    "Russian": "rus",
    "Vietnamese": "vie",
    "Spanish": "spa",
    "French": "fra",
}


def _ocr_image(path, tess_lang):
    """OCR one image: ocrmypdf first, plain pytesseract as the fallback."""
    try:
        # Scratch PDFs live in a temporary directory so they are removed
        # whether OCR succeeds or fails.
        with tempfile.TemporaryDirectory() as workdir:
            src_pdf = os.path.join(workdir, "input.pdf")
            ocr_pdf = os.path.join(workdir, "ocr.pdf")
            # The conversion is inside the try on purpose: an image mode the
            # PDF writer rejects must still reach the pytesseract fallback.
            with Image.open(path) as img:
                img.save(src_pdf, "PDF")
            ocrmypdf.ocr(src_pdf, ocr_pdf,
                         language=[tess_lang], deskew=True, optimize=0,
                         progress_bar=False)
            with pdfplumber.open(ocr_pdf) as pdf:
                return "\n".join(pg.extract_text() or "" for pg in pdf.pages)
    except Exception:
        # Deliberate best-effort: ghostscript missing or ocrmypdf failed,
        # so run Tesseract on the image directly.
        with Image.open(path) as img:
            return pytesseract.image_to_string(img, lang=tess_lang)


def trans_doc(file_in, src, tgt):
    """Extract text from a PDF or an image and translate it.

    PDFs are read with pdfplumber (first five pages only). Any other file is
    treated as an image and goes through OCR.

    Args:
        file_in: Uploaded file value; resolved to a path via ``_safe``.
        src: Source language name; selects the OCR language ("eng" when the
            name is unknown) and is passed to the translator.
        tgt: Target language name for the translation.

    Returns:
        ``(extracted_text, translated_text)``. On a missing file, an
        extraction error or empty text, the first item is a warning/error
        message and the second is ``""``.
    """
    p = _safe(file_in)
    if not p or not os.path.exists(p):
        return "โš ๏ธ ํŒŒ์ผ ์—…๋กœ๋“œ", ""
    ext = os.path.splitext(p)[1].lower()
    mime = mimetypes.guess_type(p)[0] or ""
    try:
        if ext == ".pdf" or "pdf" in mime:               # PDF
            with pdfplumber.open(p) as pdf:
                txt = "\n".join(pg.extract_text() or "" for pg in pdf.pages[:5])
        else:                                            # image
            txt = _ocr_image(p, _TESS_LANG.get(src, "eng"))
    except Exception as e:
        return f"โŒ ์ถ”์ถœ ์˜ค๋ฅ˜: {e}", ""
    txt = txt.strip()
    if not txt:
        return "โš ๏ธ ํ…์ŠคํŠธ ์ถ”์ถœ ์‹คํŒจ", ""
    return txt, _gpt(txt, src, tgt)

# ───── 4. Real-time single lang ───────────────────────────────────
def stream_one(path, src, tgt, state):
    """Transcribe one streamed audio file and extend the running translation.

    Args:
        path: Path of the audio file for this stream event (may be empty).
        src: Source language name.
        tgt: Target language name.
        state: Dict with keys ``"o"`` (original text so far) and ``"t"``
            (translation so far), or a falsy value on the first call.

    Returns:
        ``(original_text, translated_text, state)``; ``state`` is the same
        dict, updated in place.
    """
    state = state or {"o": "", "t": ""}
    if not path or not os.path.exists(path):
        return state["o"], state["t"].strip(), state
    with open(path, "rb") as f:
        stt = client.audio.transcriptions.create(model="whisper-1", file=f,
                                                 language=LC.get(src))
    full = stt.text.strip()
    prev = state["o"]
    # A stream may hand over either a growing recording or independent chunks.
    # Slicing by the previous length is only valid when the new transcript
    # actually extends the old one; otherwise treat the whole text as new and
    # append it, instead of cutting into it or replacing what was shown.
    if full.startswith(prev):
        new, merged = full[len(prev):], full
    else:
        new, merged = full, f"{prev} {full}"
    new = new.strip()
    if new:
        state["o"] = merged
        state["t"] += " " + _gpt(new, src, tgt)
    return state["o"], state["t"].strip(), state

# ───── 5. Real-time 4 langs ───────────────────────────────────────
def stream_four(path, src, state):
    """Transcribe one streamed audio file and extend all ``FOUR`` translations.

    Args:
        path: Path of the audio file for this stream event (may be empty).
        src: Source language name.
        state: Dict with key ``"o"`` (original text so far) plus one key per
            language in ``FOUR``, or a falsy value on the first call.

    Returns:
        ``(original, <one translation per FOUR entry, in order>, state)``;
        ``state`` is the same dict, updated in place.
    """
    state = state or {k: "" for k in ["o"] + FOUR}
    if path and os.path.exists(path):
        with open(path, "rb") as f:
            stt = client.audio.transcriptions.create(model="whisper-1", file=f,
                                                     language=LC.get(src))
        full = stt.text.strip()
        prev = state["o"]
        # A stream may hand over either a growing recording or independent
        # chunks. Slicing by the previous length is only valid when the new
        # transcript extends the old one; otherwise the whole text is new.
        if full.startswith(prev):
            new, merged = full[len(prev):], full
        else:
            new, merged = full, f"{prev} {full}"
        new = new.strip()
        if new:
            state["o"] = merged
            for l in FOUR:
                state[l] += " " + _gpt(new, src, l)
    # One exit for both paths, driven by FOUR so the column order cannot
    # drift from the translation loop above.
    return (state["o"].strip(), *(state[l].strip() for l in FOUR), state)

# ───── 6. UI ──────────────────────────────────────────────────────
# Four-tab Gradio layout; each tab wires its widgets to one handler above.
with gr.Blocks(title="SMARTok Demo", theme=gr.themes.Soft()) as app:
    with gr.Tabs():
        # Tab 1: one-shot audio -> transcript, translation and TTS audio
        with gr.TabItem("๐ŸŽ™๏ธ ์˜ค๋””์˜ค ๋ฒˆ์—ญ"):
            s1 = gr.Dropdown(LANG, value="Korean", label="์ž…๋ ฅ")
            t1 = gr.Dropdown(LANG, value="English", label="์ถœ๋ ฅ")
            a1 = gr.Audio(sources=["microphone", "upload"], type="filepath")
            btn1 = gr.Button("๋ฒˆ์—ญ")
            o1 = gr.Textbox(label="์›๋ฌธ", lines=5)
            tr1 = gr.Textbox(label="๋ฒˆ์—ญ", lines=5)
            aud1 = gr.Audio(label="TTS", type="filepath", autoplay=True)
            btn1.click(trans_audio, [a1, s1, t1], [o1, tr1, aud1])

        # Tab 2: PDF / image -> extracted text and translation
        with gr.TabItem("๐Ÿ“„ ๋ฌธ์„œยท์ด๋ฏธ์ง€ ๋ฒˆ์—ญ"):
            s2 = gr.Dropdown(LANG, value="Korean", label="์ž…๋ ฅ")
            t2 = gr.Dropdown(LANG, value="English", label="์ถœ๋ ฅ")
            f2 = gr.File(file_types=[".pdf", ".png", ".jpg", ".jpeg", ".bmp", ".tiff", ".gif"])
            btn2 = gr.Button("๋ฒˆ์—ญ")
            o2 = gr.Textbox(label="์ถ”์ถœ ์›๋ฌธ", lines=15)
            tr2 = gr.Textbox(label="๋ฒˆ์—ญ ๊ฒฐ๊ณผ", lines=15)
            btn2.click(trans_doc, [f2, s2, t2], [o2, tr2])

        # Tab 3: real-time, one target language
        with gr.TabItem("โฑ๏ธ ์‹ค์‹œ๊ฐ„ 1์–ธ์–ด"):
            s3 = gr.Dropdown(LANG, value="Korean", label="์ž…๋ ฅ")
            t3 = gr.Dropdown(LANG, value="English", label="์ถœ๋ ฅ")
            # type="filepath": stream_one opens the value as a file path.
            mic3 = gr.Audio(sources=["microphone"], streaming=True, type="filepath")
            # Two separate boxes: one shared Textbox bound to both outputs
            # would only ever show the last value written to it.
            o3 = gr.Textbox(lines=8, label="์›๋ฌธ")
            tr3 = gr.Textbox(lines=8, label="๋ฒˆ์—ญ")
            st3 = gr.State()
            # The mic must be listed first: stream_one(path, src, tgt, state).
            mic3.stream(stream_one, inputs=[mic3, s3, t3, st3], outputs=[o3, tr3, st3])

        # Tab 4: real-time, four target languages at once
        with gr.TabItem("๐ŸŒ ์‹ค์‹œ๊ฐ„ 4์–ธ์–ด"):
            s4 = gr.Dropdown(LANG, value="Korean", label="์ž…๋ ฅ ์–ธ์–ด")
            # type="filepath": stream_four opens the value as a file path.
            mic4 = gr.Audio(sources=["microphone"], streaming=True, type="filepath")
            o4 = gr.Textbox(label="์›๋ฌธ", lines=8)
            e4 = gr.Textbox(label="English", lines=8)
            c4 = gr.Textbox(label="Chinese(็ฎ€ไฝ“)", lines=8)
            th4 = gr.Textbox(label="Thai", lines=8)
            r4 = gr.Textbox(label="Russian", lines=8)
            st4 = gr.State()
            # The mic must be listed first: stream_four(path, src, state).
            mic4.stream(stream_four, inputs=[mic4, s4, st4],
                        outputs=[o4, e4, c4, th4, r4, st4])

# ───── 7. Run ─────────────────────────────────────────────────────
# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    app.launch(
        server_name="0.0.0.0",  # listen on every network interface
        server_port=7860,
        share=False,
        debug=True,
    )