Spaces:
Running
on
Zero
Running
on
Zero
""" | |
SMARTok core demo
----------------------------------------------
- Tab 1: one-shot audio (recording or file) translation + TTS playback
- Tab 2: PDF / image translation (without Tesseract, only PDF is offered)
- Tab 3: real-time translation into one selectable language
- Tab 4: real-time simultaneous translation into four languages (English, Simplified Chinese, Thai, Russian)
----------------------------------------------
Requires Python >= 3.10, Gradio 4.x, and the OpenAI Python SDK
""" | |
import gradio as gr | |
import openai, os, io, tempfile, mimetypes | |
from dotenv import load_dotenv | |
# ------------------- 0. Common initialization -------------------
# Pull variables from a local .env file into the process environment
# before the API key is looked up.
load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")
if api_key:
    # Single shared OpenAI client used by every helper in this module.
    client = openai.OpenAI(api_key=api_key)
else:
    # Fail at import time: nothing in this module works without a key.
    raise RuntimeError("OPENAI_API_KEY๋ฅผ .env ํ์ผ์ ์ค์ ํ์ธ์!")
# Language display name -> two-letter code handed to the transcription
# call as its `language` argument.
LANG_CODE = {
    "Korean": "ko",
    "English": "en",
    "Japanese": "ja",
    "Chinese": "zh",
    "Thai": "th",
    "Russian": "ru",
    "Vietnamese": "vi",
    "Spanish": "es",
    "French": "fr",
}

# Display names offered in the dropdowns (same order as LANG_CODE).
LANGUAGES = list(LANG_CODE)

# TTS voice per language: "nova" for Korean/Japanese/Chinese, "alloy" for
# everything else.
VOICE = dict.fromkeys(LANGUAGES, "alloy")
VOICE.update(dict.fromkeys(("Korean", "Japanese", "Chinese"), "nova"))

# Target languages of the simultaneous real-time translation tab.
FOUR_LANGS = ["English", "Chinese", "Thai", "Russian"]

# Interval between Whisper calls, in seconds (roughly 3-4 s of latency).
STREAM_SEC = 4
# ------------------- 1. Utility functions -------------------
def _safe_path(v):
    """Extract a filesystem path from a Gradio File/Audio input value.

    Args:
        v: The value delivered by the component. Handled shapes:
            * ``None`` -> ``None``
            * ``dict`` -> its ``"name"`` entry, falling back to ``"path"``
            * ``str`` / ``os.PathLike`` -> returned unchanged
            * any other object -> its ``name`` attribute if it has one
              (file-like wrappers), otherwise the object itself

    Returns:
        The path (or path-like value) to open, or ``None`` when nothing
        usable was supplied.
    """
    if v is None:
        return None
    if isinstance(v, dict):
        # "name" is the key this module has always read; "path" is accepted
        # as a fallback so dict payloads that only carry "path" still work.
        # NOTE(review): which key is present depends on the Gradio version.
        return v.get("name") or v.get("path")
    if isinstance(v, (str, os.PathLike)):
        # Checked before the attribute lookup below: pathlib paths also have
        # a `.name`, but there it is only the basename.
        return v
    # File-like wrappers (e.g. tempfile objects) expose their path as `.name`.
    return getattr(v, "name", v)
def _gpt_translate(text: str, src: str, tgt: str) -> str:
    """Translate *text* from *src* to *tgt* with gpt-3.5-turbo.

    The system prompt asks the model for the translation only, without any
    explanation.

    Args:
        text: Source text to translate.
        src: Name of the source language (e.g. "Korean").
        tgt: Name of the target language (e.g. "English").

    Returns:
        The translated text with surrounding whitespace removed, or an empty
        string when the model returned no text content.
    """
    system_prompt = (
        f"You are a professional translator. Translate the following {src} text to {tgt}. "
        f"Only provide the translated text."
    )
    rsp = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": text},
        ],
        temperature=0.3,
        max_tokens=4096,
    )
    # The message content can be None (e.g. a refusal or an empty completion);
    # treat that as "no translation" instead of raising AttributeError.
    content = rsp.choices[0].message.content
    return (content or "").strip()
def _tts(text: str, lang: str) -> str:
    """Synthesize *text* with the OpenAI ``tts-1`` model and save it to disk.

    Args:
        text: Text to speak; only the first 4096 characters are sent.
        lang: Language name used to pick the voice from ``VOICE``
            (falls back to "alloy" for unknown languages).

    Returns:
        Path of a newly created temporary ``.mp3`` file holding the audio.
        The file is created with ``delete=False`` and is not removed here.
    """
    speech = client.audio.speech.create(
        model="tts-1",
        voice=VOICE.get(lang, "alloy"),
        input=text[:4096],
    )
    # The context manager closes the handle even if the write fails;
    # delete=False keeps the file on disk so the returned path stays valid.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
        tmp.write(speech.content)
    return tmp.name
# ------------------- 2. One-shot audio translation -------------------
def translate_audio(audio_in, src, tgt):
    """Transcribe an audio file, translate the transcript and synthesize speech.

    Args:
        audio_in: Value of the audio input component; resolved to a path via
            ``_safe_path``.
        src: Source language name (key of ``LANG_CODE``).
        tgt: Target language name.

    Returns:
        A 3-tuple ``(original_text, translated_text, tts_path)``. When no
        file is available or nothing was recognized, the first element is a
        warning message, the second is ``""`` and the third is ``None``.
    """
    audio_path = _safe_path(audio_in)
    if not (audio_path and os.path.exists(audio_path)):
        return "โ ๏ธ ์์ฑ ํ์ผ์ ๋ น์-์ ๋ก๋ํ์ธ์.", "", None

    # Speech-to-text with Whisper, hinted with the source language code.
    with open(audio_path, "rb") as audio_file:
        transcription = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file,
            language=LANG_CODE.get(src),
        )

    source_text = transcription.text.strip()
    if not source_text:
        return "โ ๏ธ ์์ฑ ์ธ์ ์คํจ", "", None

    target_text = _gpt_translate(source_text, src, tgt)
    return source_text, target_text, _tts(target_text, tgt)
# ------------------- 3. PDF / image translation -------------------
def translate_document(file_in, src, tgt):
    """Extract text from a PDF or an image and translate it.

    PDFs are read with pdfplumber (first five pages only); any other file is
    treated as an image and passed through Tesseract OCR via pytesseract.

    Args:
        file_in: Value of the file input component; resolved to a path via
            ``_safe_path``.
        src: Source language name.
        tgt: Target language name.

    Returns:
        A 2-tuple ``(extracted_text, translated_text)``. On any failure the
        first element is a warning/error message and the second is ``""``.
    """
    path = _safe_path(file_in)
    if not path or not os.path.exists(path):
        return "โ ๏ธ PDF(๋๋ ์ด๋ฏธ์ง) ํ์ผ์ ์ ๋ก๋ํ์ธ์.", ""

    ext = os.path.splitext(path)[1].lower()
    mime = mimetypes.guess_type(path)[0] or ""
    text = ""
    try:
        if ext == ".pdf" or "pdf" in mime:
            import pdfplumber

            with pdfplumber.open(path) as pdf:
                pages = pdf.pages[:5]  # demo: limited to the first 5 pages
                # extract_text() may return None for pages without text.
                text = "\n".join(p.extract_text() or "" for p in pages)
        else:
            # Images require Tesseract.
            try:
                from PIL import Image
                import pytesseract

                # Open the image in a context manager so the underlying file
                # handle is released once OCR is done.
                with Image.open(path) as img:
                    text = pytesseract.image_to_string(img)
            except Exception:
                # Deliberately broad: any OCR-side failure is reported as
                # "OCR unavailable". NOTE(review): this also hides unreadable
                # or corrupt image files behind the same message.
                return "โ ๏ธ ์๋ฒ์ Tesseract OCR๊ฐ ์์ด์ ์ด๋ฏธ์ง OCR์ ์ง์๋์ง ์์ต๋๋ค. PDF๋ง ์ฌ์ฉํ์ธ์.", ""
    except Exception as e:
        return f"โ ํ ์คํธ ์ถ์ถ ์คํจ: {e}", ""

    text = text.strip()
    if not text:
        return "โ ๏ธ ํ ์คํธ๊ฐ ์ถ์ถ๋์ง ์์์ต๋๋ค.", ""

    translated = _gpt_translate(text, src, tgt)
    return text, translated
# ------------------- 4. Real-time translation, one language -------------------
def stream_single(mic_stream, src, tgt):
    """Transcribe and translate a microphone stream into one target language.

    Chunks are pulled from ``mic_stream.recv()`` until it returns ``None``.
    Buffered audio is flushed to Whisper whenever it exceeds the segment
    threshold, and once more at the end for any remainder.

    Unlike a call to ``translate_audio``, each segment is only transcribed
    and translated: no TTS clip is generated, and segments in which nothing
    was recognized add nothing to the running text.

    Args:
        mic_stream: Object whose ``recv()`` returns bytes chunks, or ``None``
            at end of stream. The first 44 bytes of the first chunk are
            treated as the WAV header (assumption inherited from the original
            code; confirm against the actual stream source).
        src: Source language name (key of ``LANG_CODE``).
        tgt: Target language name.

    Yields:
        ``(original_so_far, translated_so_far)`` after every processed
        segment.
    """
    buf, header = io.BytesIO(), None
    o_acc, t_acc = "", ""

    def _process_segment(wav_header, payload):
        """Transcribe one segment and translate it; ("", "") if nothing heard."""
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
            tmp.write(wav_header + payload)
        try:
            with open(tmp.name, "rb") as f:
                stt = client.audio.transcriptions.create(
                    model="whisper-1",
                    file=f,
                    language=LANG_CODE.get(src),
                )
        finally:
            # The segment file is only needed for the upload above.
            os.unlink(tmp.name)
        heard = stt.text.strip()
        if not heard:
            return "", ""
        return heard, _gpt_translate(heard, src, tgt)

    while True:
        chunk = mic_stream.recv()
        if chunk is None:
            break
        if header is None:
            # Keep the header aside and buffer only the payload; otherwise
            # the first segment would be written with the header twice.
            header, chunk = chunk[:44], chunk[44:]
        buf.write(chunk)
        # Presumably STREAM_SEC seconds of 16 kHz, 16-bit mono audio —
        # TODO confirm the stream format.
        if buf.tell() > 16000 * 2 * STREAM_SEC:
            o, t = _process_segment(header, buf.getvalue())
            buf = io.BytesIO()
            if o:
                o_acc = f"{o_acc} {o}".strip()
                t_acc = f"{t_acc} {t}".strip()
            yield o_acc, t_acc

    # Flush whatever is left once the stream has ended.
    if buf.tell():
        o, t = _process_segment(header, buf.getvalue())
        if o:
            o_acc = f"{o_acc} {o}".strip()
            t_acc = f"{t_acc} {t}".strip()
        yield o_acc, t_acc
# ------------------- 5. Real-time translation, four languages -------------------
def stream_multi(mic_stream, src):
    """Transcribe a microphone stream and translate it into ``FOUR_LANGS``.

    Chunks are pulled from ``mic_stream.recv()`` until it returns ``None``.
    Buffered audio is flushed to Whisper whenever it exceeds the segment
    threshold, and once more at the end for any remainder. Each recognized
    segment is translated into every language in ``FOUR_LANGS``.

    Args:
        mic_stream: Object whose ``recv()`` returns bytes chunks, or ``None``
            at end of stream. The first 44 bytes of the first chunk are
            treated as the WAV header (assumption inherited from the original
            code; confirm against the actual stream source).
        src: Source language name (key of ``LANG_CODE``).

    Yields:
        A tuple with the accumulated original text followed by the
        accumulated translation for each entry of ``FOUR_LANGS``, in order.
    """
    keys = ["orig"] + FOUR_LANGS
    acc = {key: "" for key in keys}
    buf, header = io.BytesIO(), None

    def _consume_segment(wav_header, payload):
        """Transcribe one segment, extend ``acc`` and return the snapshot."""
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
            tmp.write(wav_header + payload)
        try:
            with open(tmp.name, "rb") as f:
                stt = client.audio.transcriptions.create(
                    model="whisper-1",
                    file=f,
                    language=LANG_CODE.get(src),
                )
        finally:
            # The segment file is only needed for the upload above.
            os.unlink(tmp.name)
        orig = stt.text.strip()
        if orig:
            acc["orig"] += " " + orig
            for lang in FOUR_LANGS:
                acc[lang] += " " + _gpt_translate(orig, src, lang)
        return tuple(acc[key].strip() for key in keys)

    while True:
        chunk = mic_stream.recv()
        if chunk is None:
            break
        if header is None:
            # Keep the header aside and buffer only the payload; otherwise
            # the first segment would be written with the header twice.
            header, chunk = chunk[:44], chunk[44:]
        buf.write(chunk)
        # Presumably STREAM_SEC seconds of 16 kHz, 16-bit mono audio —
        # TODO confirm the stream format.
        if buf.tell() > 16000 * 2 * STREAM_SEC:
            snapshot = _consume_segment(header, buf.getvalue())
            buf = io.BytesIO()
            yield snapshot

    # Flush whatever is left once the stream has ended.
    if buf.tell():
        yield _consume_segment(header, buf.getvalue())
# ------------------- 6. Gradio UI -------------------
# Four-tab Gradio interface; `app` is launched by the __main__ guard.
with gr.Blocks(title="SMARTok Demo", theme=gr.themes.Soft()) as app:
    with gr.Tabs():
        # Tab 1 - audio translation
        with gr.TabItem("๐๏ธ ์ค๋์ค ๋ฒ์ญ"):
            src1 = gr.Dropdown(LANGUAGES, value="Korean", label="์ ๋ ฅ")
            tgt1 = gr.Dropdown(LANGUAGES, value="English", label="์ถ๋ ฅ")
            aud1 = gr.Audio(sources=["microphone", "upload"],
                            type="filepath",
                            label="๋ น์ ๋๋ ์ค๋์ค ํ์ผ ์ ๋ก๋")
            btn1 = gr.Button("๋ฒ์ญ")
            stt1 = gr.Textbox(label="์๋ฌธ", lines=5)
            tlt1 = gr.Textbox(label="๋ฒ์ญ", lines=5)
            out1 = gr.Audio(label="TTS", type="filepath", autoplay=True)
            btn1.click(translate_audio, [aud1, src1, tgt1], [stt1, tlt1, out1])

        # Tab 2 - document / image translation
        with gr.TabItem("๐ ๋ฌธ์/์ด๋ฏธ์ง ๋ฒ์ญ"):
            src2 = gr.Dropdown(LANGUAGES, value="Korean", label="์ ๋ ฅ")
            tgt2 = gr.Dropdown(LANGUAGES, value="English", label="์ถ๋ ฅ")
            file2 = gr.File(label="PDF ๋๋ ์ด๋ฏธ์ง ์ ๋ก๋",
                            file_types=[".pdf", ".png", ".jpg", ".jpeg", ".bmp", ".tiff", ".gif"])
            btn2 = gr.Button("๋ฒ์ญ")
            org2 = gr.Textbox(label="์ถ์ถ ์๋ฌธ", lines=15)
            trs2 = gr.Textbox(label="๋ฒ์ญ ๊ฒฐ๊ณผ", lines=15)
            btn2.click(translate_document, [file2, src2, tgt2], [org2, trs2])

        # Tab 3 - real-time translation into one language
        with gr.TabItem("โฑ๏ธ ์ค์๊ฐ 1์ธ์ด"):
            src3 = gr.Dropdown(LANGUAGES, value="Korean", label="์ ๋ ฅ")
            tgt3 = gr.Dropdown(LANGUAGES, value="English", label="์ถ๋ ฅ")
            mic3 = gr.Audio(sources=["microphone"],
                            streaming=True,
                            label="์ค์๊ฐ ๋ง์ดํฌ")
            stt3 = gr.Textbox(label="์๋ฌธ(์ค์๊ฐ)", lines=8)
            tlt3 = gr.Textbox(label="๋ฒ์ญ(์ค์๊ฐ)", lines=8)
            # The handler takes (mic_stream, src, tgt), so the microphone
            # component has to be the first input.
            # NOTE(review): stream_single calls `mic_stream.recv()`; confirm
            # that the value Gradio passes for a streaming Audio component
            # supports that protocol.
            mic3.stream(stream_single, inputs=[mic3, src3, tgt3], outputs=[stt3, tlt3])

        # Tab 4 - real-time translation into four languages
        with gr.TabItem("๐ ์ค์๊ฐ 4๊ฐ ์ธ์ด"):
            gr.Markdown("๋ง์ดํฌ ์ ๋ ฅ์ 3-4 ์ด ๊ฐ๊ฒฉ์ผ๋ก **English / Chinese(็ฎไฝ) / Thai / Russian** 4๊ฐ ์ธ์ด๋ก ๋์ ๋ฒ์ญํฉ๋๋ค.")
            src4 = gr.Dropdown(LANGUAGES, value="Korean", label="์ ๋ ฅ ์ธ์ด")
            mic4 = gr.Audio(sources=["microphone"],
                            streaming=True,
                            label="์ค์๊ฐ ๋ง์ดํฌ")
            o4 = gr.Textbox(label="์๋ฌธ", lines=8)
            e4 = gr.Textbox(label="English", lines=8)
            z4 = gr.Textbox(label="Chinese(็ฎไฝ)", lines=8)
            t4 = gr.Textbox(label="Thai", lines=8)
            r4 = gr.Textbox(label="Russian", lines=8)
            # The handler takes (mic_stream, src); the output order matches
            # the tuple it yields (original, English, Chinese, Thai, Russian).
            # NOTE(review): same `recv()` protocol caveat as in tab 3.
            mic4.stream(stream_multi, inputs=[mic4, src4],
                        outputs=[o4, e4, z4, t4, r4])
# ------------------- 7. Run -------------------
if __name__ == "__main__":
    # Serve on every network interface, port 7860, without a public share
    # link and with Gradio's debug mode switched on.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "debug": True,
    }
    app.launch(**launch_options)