Spaces:
Running
on
Zero
Running
on
Zero
""" | |
SMARTok ์ค์๊ฐ ๋ค๊ตญ์ด ๋ฐ๋ชจ (์์ ์์ ๋ณธ) | |
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | |
โข ํญ1 ๐๏ธ ์ค๋์ค ๋ฒ์ญ : ๋ง์ดํฌ/ํ์ผ โ ๋ฒ์ญ + TTS | |
โข ํญ2 ๐ ๋ฌธ์ยท์ด๋ฏธ์ง ๋ฒ์ญ : PDF / ์ด๋ฏธ์ง(OCR) โ ๋ฒ์ญ | |
โข ํญ3 โฑ๏ธ ์ค์๊ฐ 1์ธ์ด ๋ฒ์ญ : ๋ง์ดํฌ โ 1๊ฐ ์ธ์ด ์ค์๊ฐ ์๋ง | |
โข ํญ4 ๐ ์ค์๊ฐ 4๊ฐ ์ธ์ด ๋ฒ์ญ : ๋ง์ดํฌ โ ์ยท์คยทํยท๋ฌ ๋์ ์๋ง | |
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | |
ํ์ apt : tesseract-ocr libtesseract-dev ocrmypdf ffmpeg | |
ํ์ pip : gradio>=5.33 openai python-dotenv pdfplumber ocrmypdf pillow | |
""" | |
import gradio as gr | |
import openai, os, io, tempfile, mimetypes, json, uuid | |
from dotenv import load_dotenv | |
import pdfplumber, ocrmypdf | |
from PIL import Image | |
# โโโโโโโโโโโโโโ 0. ๊ณตํต ์ด๊ธฐํ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | |
# Read environment variables from a local .env file (if any) before looking
# up the API key.
load_dotenv()

api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    # Fail at import time: every tab needs the OpenAI client.
    raise RuntimeError("OPENAI_API_KEY๋ฅผ .env ํ์ผ์ ์ค์ ํ์ธ์!")
client = openai.OpenAI(api_key=api_key)

# UI language name -> two-letter code passed to the transcription API.
LANG_CODE = {
    "Korean": "ko",
    "English": "en",
    "Japanese": "ja",
    "Chinese": "zh",
    "Thai": "th",
    "Russian": "ru",
    "Vietnamese": "vi",
    "Spanish": "es",
    "French": "fr",
}

# Dropdown choices, in the same order as the mapping above.
LANGUAGES = list(LANG_CODE)

# TTS voice per language: "nova" for Korean/Japanese/Chinese, "alloy" otherwise.
VOICE = dict.fromkeys(LANGUAGES, "alloy")
VOICE.update(dict.fromkeys(("Korean", "Japanese", "Chinese"), "nova"))

# Target languages of the four-language live tab.
FOUR = ["English", "Chinese", "Thai", "Russian"]

STREAM_SEC = 4  # Whisper call interval
# โโโโโโโโโโโโโโ 1. ์ ํธ ํจ์ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | |
def _safe(v): | |
if v is None: | |
return None | |
return v["name"] if isinstance(v, dict) else v | |
def _gpt(text, src, tgt): | |
rsp = client.chat.completions.create( | |
model="gpt-3.5-turbo", | |
messages=[ | |
{"role":"system", | |
"content":f"Translate the following {src} text to {tgt}. " | |
"Return only the translation."}, | |
{"role":"user","content":text} | |
], | |
temperature=0.3,max_tokens=4096 | |
) | |
return rsp.choices[0].message.content.strip() | |
def _tts(text, lang): | |
rsp = client.audio.speech.create( | |
model="tts-1", | |
voice=VOICE.get(lang,"alloy"), | |
input=text[:4096] | |
) | |
tmp = tempfile.NamedTemporaryFile(delete=False,suffix=".mp3") | |
tmp.write(rsp.content); tmp.close() | |
return tmp.name | |
# โโโโโโโโโโโโโโ 2. ์ค๋์ค(๋จ๊ฑด) ๋ฒ์ญ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | |
def translate_audio(audio_in, src, tgt):
    """Transcribe one audio clip, translate it, and synthesize the result.

    Args:
        audio_in: Audio payload from the UI; normalized with ``_safe``.
        src: Source language name (looked up in ``LANG_CODE`` for Whisper).
        tgt: Target language name.

    Returns:
        A ``(original_text, translated_text, tts_path)`` tuple. When the file
        is missing or nothing was recognized, the first element is a warning
        message, the second is ``""`` and the third is ``None``.
    """
    path = _safe(audio_in)
    if not (path and os.path.exists(path)):
        return "โ ๏ธ ์์ฑ ํ์ผ ํ์", "", None

    with open(path, "rb") as audio_file:
        transcript = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file,
            language=LANG_CODE.get(src),
        )

    original_text = transcript.text.strip()
    if not original_text:
        return "โ ๏ธ ์์ฑ ์ธ์ ์คํจ", "", None

    translated_text = _gpt(original_text, src, tgt)
    speech_path = _tts(translated_text, tgt)
    return original_text, translated_text, speech_path
# โโโโโโโโโโโโโโ 3. ๋ฌธ์ / ์ด๋ฏธ์ง ๋ฒ์ญ โโโโโโโโโโโโโโโโโโโโโโโโโโโโ | |
# Tesseract language-pack names for each UI language. Tesseract does not use
# the two-letter codes that the speech API takes, so OCR needs its own table.
# The matching language packs must be installed on the host.
_TESSERACT_LANG = {
    "Korean": "kor",
    "English": "eng",
    "Japanese": "jpn",
    "Chinese": "chi_sim",
    "Thai": "tha",
    "Russian": "rus",
    "Vietnamese": "vie",
    "Spanish": "spa",
    "French": "fra",
}


def translate_doc(file_in, src, tgt):
    """Extract text from a PDF or an image (via OCR) and translate it.

    PDFs are read directly with pdfplumber (first five pages only). Any other
    file is treated as an image: it is wrapped into a one-page PDF, run
    through OCRmyPDF, and the recognized text layer is read back.

    Args:
        file_in: Uploaded-file payload from the UI; normalized with ``_safe``.
        src: Source language name (selects the OCR language).
        tgt: Target language name.

    Returns:
        ``(extracted_text, translated_text)``. On a missing file, an
        extraction error, or empty extraction, the first element is a
        warning/error message and the second is ``""``.
    """
    p = _safe(file_in)
    if not p or not os.path.exists(p):
        return "โ ๏ธ PDF/์ด๋ฏธ์ง ์ ๋ก๋", ""

    ext = os.path.splitext(p)[1].lower()
    mime = mimetypes.guess_type(p)[0] or ""

    try:
        if ext == ".pdf" or "pdf" in mime:
            # PDF: read the existing text layer as-is.
            with pdfplumber.open(p) as pdf:
                txt = "\n".join(pg.extract_text() or "" for pg in pdf.pages[:5])
        else:
            # Image: convert to PDF, OCR it, then read the text layer.
            # The intermediates live in a temp directory so they are removed
            # once the text has been extracted.
            with tempfile.TemporaryDirectory() as workdir:
                img_pdf = os.path.join(workdir, "image.pdf")
                ocr_pdf = os.path.join(workdir, "ocr.pdf")

                with Image.open(p) as img:
                    # Modes with alpha or a palette may not be writable as
                    # PDF; RGB is always accepted.
                    page = img if img.mode in ("RGB", "L") else img.convert("RGB")
                    page.save(img_pdf, "PDF")

                ocrmypdf.ocr(
                    img_pdf,
                    ocr_pdf,
                    language=[_TESSERACT_LANG.get(src, "eng")],
                    deskew=True,
                    optimize=0,
                    progress_bar=False,
                )

                with pdfplumber.open(ocr_pdf) as pdf:
                    txt = "\n".join(pg.extract_text() or "" for pg in pdf.pages)
    except Exception as e:
        # UI boundary: report the failure in the output box instead of raising.
        return f"โ ์ถ์ถ ์ค๋ฅ: {e}", ""

    txt = txt.strip()
    if not txt:
        return "โ ๏ธ ํ ์คํธ ์ถ์ถ ์คํจ", ""
    return txt, _gpt(txt, src, tgt)
# โโโโโโโโโโโโโโ 4. ์ค์๊ฐ 1์ธ์ด โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | |
def stream_single(audio_path, src, tgt, state):
    """Handle one streamed microphone chunk for the single-language live tab.

    Each call is treated as carrying only the newest audio chunk, so its
    transcript is appended to the running text rather than sliced against the
    previous transcript by character count.

    Args:
        audio_path: Path of the audio chunk file; may be empty or missing.
        src: Source language name (looked up in ``LANG_CODE`` for Whisper).
        tgt: Target language name.
        state: Dict with the accumulated ``"orig"`` and ``"trans"`` text, or
            ``None`` on the first call.

    Returns:
        ``(original_so_far, translation_so_far, state)``.
    """
    state = state or {"orig": "", "trans": ""}
    if not audio_path or not os.path.exists(audio_path):
        # No usable audio: echo back what has been accumulated so far.
        return state["orig"], state["trans"], state

    with open(audio_path, "rb") as f:
        stt = client.audio.transcriptions.create(
            model="whisper-1", file=f, language=LANG_CODE.get(src))

    chunk = stt.text.strip()
    if chunk:
        # Translate only the new chunk and append it to the running text.
        state["orig"] = f'{state["orig"]} {chunk}'.strip()
        state["trans"] = f'{state["trans"]} {_gpt(chunk, src, tgt)}'.strip()
    return state["orig"], state["trans"], state
# โโโโโโโโโโโโโโ 5. ์ค์๊ฐ 4์ธ์ด โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | |
def stream_multi(audio_path, src, state):
    """Handle one streamed microphone chunk for the four-language live tab.

    Each call is treated as carrying only the newest audio chunk, so its
    transcript (and its translation into every language in ``FOUR``) is
    appended to the running text rather than sliced against the previous
    transcript by character count.

    Args:
        audio_path: Path of the audio chunk file; may be empty or missing.
        src: Source language name (looked up in ``LANG_CODE`` for Whisper).
        state: Dict with the accumulated ``"orig"`` text plus one entry per
            language in ``FOUR``, or ``None`` on the first call.

    Returns:
        ``(original, english, chinese, thai, russian, state)``.
    """
    state = state or {k: "" for k in ["orig"] + FOUR}
    if not audio_path or not os.path.exists(audio_path):
        # No usable audio: echo back what has been accumulated so far.
        return (state["orig"], state["English"], state["Chinese"],
                state["Thai"], state["Russian"], state)

    with open(audio_path, "rb") as f:
        stt = client.audio.transcriptions.create(
            model="whisper-1", file=f, language=LANG_CODE.get(src))

    chunk = stt.text.strip()
    if chunk:
        state["orig"] = f'{state["orig"]} {chunk}'.strip()
        # One translation request per target language for the new chunk only.
        for lang in FOUR:
            state[lang] = f'{state[lang]} {_gpt(chunk, src, lang)}'.strip()

    return (state["orig"].strip(),
            state["English"].strip(),
            state["Chinese"].strip(),
            state["Thai"].strip(),
            state["Russian"].strip(),
            state)
# โโโโโโโโโโโโโโ 6. Gradio UI โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | |
# Four-tab Gradio UI. The two live tabs pass the microphone component itself
# as the first input of their stream handlers and deliver audio as file
# paths, because stream_single / stream_multi open the chunk with open().
with gr.Blocks(title="SMARTok Demo", theme=gr.themes.Soft()) as app:
    with gr.Tabs():
        # Tab 1: one-shot audio translation with TTS playback.
        with gr.TabItem("๐๏ธ ์ค๋์ค ๋ฒ์ญ"):
            src1 = gr.Dropdown(LANGUAGES, value="Korean", label="์ ๋ ฅ ์ธ์ด")
            tgt1 = gr.Dropdown(LANGUAGES, value="English", label="์ถ๋ ฅ ์ธ์ด")
            aud1 = gr.Audio(sources=["microphone", "upload"], type="filepath")
            res1 = gr.Button("๋ฒ์ญ")
            o1 = gr.Textbox(label="์๋ฌธ", lines=5)
            t1 = gr.Textbox(label="๋ฒ์ญ", lines=5)
            a1 = gr.Audio(label="TTS", type="filepath", autoplay=True)
            res1.click(translate_audio, [aud1, src1, tgt1], [o1, t1, a1])

        # Tab 2: PDF / image translation.
        with gr.TabItem("๐ ๋ฌธ์ยท์ด๋ฏธ์ง ๋ฒ์ญ"):
            src2 = gr.Dropdown(LANGUAGES, value="Korean", label="์ ๋ ฅ ์ธ์ด")
            tgt2 = gr.Dropdown(LANGUAGES, value="English", label="์ถ๋ ฅ ์ธ์ด")
            file2 = gr.File(label="PDF/์ด๋ฏธ์ง ์ ๋ก๋",
                            file_types=[".pdf", ".png", ".jpg", ".jpeg",
                                        ".bmp", ".tiff", ".gif"])
            doc2 = gr.Button("๋ฒ์ญ")
            o2 = gr.Textbox(label="์ถ์ถ ์๋ฌธ", lines=15)
            t2 = gr.Textbox(label="๋ฒ์ญ ๊ฒฐ๊ณผ", lines=15)
            doc2.click(translate_doc, [file2, src2, tgt2], [o2, t2])

        # Tab 3: live translation into one language.
        with gr.TabItem("โฑ๏ธ ์ค์๊ฐ 1์ธ์ด"):
            src3 = gr.Dropdown(LANGUAGES, value="Korean", label="์ ๋ ฅ ์ธ์ด")
            tgt3 = gr.Dropdown(LANGUAGES, value="English", label="์ถ๋ ฅ ์ธ์ด")
            mic3 = gr.Audio(sources=["microphone"], streaming=True,
                            type="filepath")
            o3 = gr.Textbox(label="์๋ฌธ(์ค์๊ฐ)", lines=8)
            t3 = gr.Textbox(label="๋ฒ์ญ(์ค์๊ฐ)", lines=8)
            st3 = gr.State()
            # mic3 must be listed first: the handler's first parameter is the
            # audio chunk. STREAM_SEC sets how often a chunk is delivered.
            mic3.stream(stream_single,
                        inputs=[mic3, src3, tgt3, st3],
                        outputs=[o3, t3, st3],
                        stream_every=STREAM_SEC)

        # Tab 4: live translation into four languages at once.
        with gr.TabItem("๐ ์ค์๊ฐ 4๊ฐ ์ธ์ด"):
            src4 = gr.Dropdown(LANGUAGES, value="Korean", label="์ ๋ ฅ ์ธ์ด")
            mic4 = gr.Audio(sources=["microphone"], streaming=True,
                            type="filepath")
            o4 = gr.Textbox(label="์๋ฌธ", lines=8)
            e4 = gr.Textbox(label="English", lines=8)
            c4 = gr.Textbox(label="Chinese(็ฎไฝ)", lines=8)
            th4 = gr.Textbox(label="Thai", lines=8)
            r4 = gr.Textbox(label="Russian", lines=8)
            st4 = gr.State()
            # Same wiring as tab 3: audio chunk first, then language and state.
            mic4.stream(stream_multi,
                        inputs=[mic4, src4, st4],
                        outputs=[o4, e4, c4, th4, r4, st4],
                        stream_every=STREAM_SEC)
# โโโโโโโโโโโโโโ 7. ์คํ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | |
if __name__ == "__main__":
    # Serve on all interfaces at port 7860, without a public share link,
    # with Gradio debug mode enabled.
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        debug=True,
    )