Spaces:
Running
on
Zero
Running
on
Zero
""" | |
SMARTok demo - final stable release (2025-06-09)
------------------------------------------------------------
* Tab 1  Audio translation            : microphone / file -> translation + TTS
* Tab 2  Document / image translation : PDF / image (OCR) -> translation
* Tab 3  Real-time, one language      : microphone -> live captions (one language)
* Tab 4  Real-time, four languages    : microphone -> simultaneous English /
                                        Chinese / Thai / Russian captions
------------------------------------------------------------
Required APT packages (packages.txt)
    tesseract-ocr
    libtesseract-dev
    ghostscript
    tesseract-ocr-kor tesseract-ocr-eng
    tesseract-ocr-rus tesseract-ocr-tha
    tesseract-ocr-chi-sim
    ffmpeg
Required PIP packages (requirements.txt)
    gradio>=5.33
    openai
    python-dotenv
    pdfplumber
    ocrmypdf
    pytesseract
    pillow
""" | |
import gradio as gr | |
import openai, os, io, tempfile, mimetypes | |
from dotenv import load_dotenv | |
from PIL import Image | |
import pdfplumber, ocrmypdf, pytesseract | |
# ------------------- 0. Initialization -------------------
# Load variables from a local .env file (if any) into the environment.
load_dotenv()

# The OpenAI key is mandatory: abort at import time when it is missing/empty.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise RuntimeError("OPENAI_API_KEY ๊ฐ .env ์ ์์ต๋๋ค!")

# Single OpenAI client shared by every helper in this module.
client = openai.OpenAI(api_key=OPENAI_API_KEY)

# Language names offered in the UI dropdowns.
LANGUAGES = [
    "Korean",
    "English",
    "Japanese",
    "Chinese",
    "Thai",
    "Russian",
    "Vietnamese",
    "Spanish",
    "French",
]

# Language name -> language code (the values follow Tesseract's naming,
# e.g. "chi_sim").
LANG_CODE = {
    "Korean": "kor",
    "English": "eng",
    "Japanese": "jpn",
    "Chinese": "chi_sim",
    "Thai": "tha",
    "Russian": "rus",
    "Vietnamese": "vie",
    "Spanish": "spa",
    "French": "fra",
}

# Language name -> TTS voice: "nova" for Korean/Japanese/Chinese, "alloy" for
# everything else.  Key order follows LANGUAGES.
VOICE = dict.fromkeys(LANGUAGES, "alloy")
VOICE.update(dict.fromkeys(("Korean", "Japanese", "Chinese"), "nova"))

# Target languages of the simultaneous (four-language) translation tab.
FOUR = ["English", "Chinese", "Thai", "Russian"]

# Chunk length for the real-time tabs (the name suggests seconds).
STREAM_SEC = 4
# ------------------- 1. Shared helpers -------------------
def _safe(val):
    """Return the path carried by a Gradio File/Audio value.

    A dict yields its ``"name"`` entry; anything else (including ``None``)
    is handed back unchanged.
    """
    if isinstance(val, dict):
        return val["name"]
    return val
def _gpt_translate(text: str, src: str, tgt: str) -> str:
    """Translate ``text`` from ``src`` to ``tgt`` with gpt-3.5-turbo.

    Args:
        text: Source text to translate.
        src: Name of the source language, interpolated into the prompt.
        tgt: Name of the target language, interpolated into the prompt.

    Returns:
        The translated text with surrounding whitespace removed, or ``""``
        when ``text`` is empty/blank or the model returns no content.
    """
    # Nothing to translate: skip the API round-trip instead of letting the
    # model invent output for an empty message.
    if not text or not text.strip():
        return ""

    resp = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system",
             "content": f"Translate the following {src} text to {tgt}. "
                        "Return only the translated text."},
            {"role": "user", "content": text},
        ],
        temperature=0.3,
        max_tokens=4096,
    )
    # ``content`` is nullable in the API response; guard before ``.strip()``.
    content = resp.choices[0].message.content
    return (content or "").strip()
def _tts(text: str, lang: str) -> str:
    """Synthesize ``text`` to an mp3 file with the ``tts-1`` model.

    Args:
        text: Text to speak; only the first 4096 characters are sent.
        lang: Language name used to pick a voice from ``VOICE``
            (falls back to ``"alloy"``).

    Returns:
        Path of the temporary ``.mp3`` file. The file is created with
        ``delete=False``, so it is not removed here.
    """
    out = client.audio.speech.create(
        model="tts-1",
        voice=VOICE.get(lang, "alloy"),
        input=text[:4096],
    )
    # The context manager closes the handle even if the write fails;
    # delete=False keeps the file on disk for the caller.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
        tmp.write(out.content)
    return tmp.name
# ------------------- 2. Audio (single clip) translation -------------------
def translate_audio(audio_in, src, tgt):
    """Transcribe one audio clip, translate it and synthesize the translation.

    Args:
        audio_in: Gradio audio value (file path, or a dict handled by ``_safe``).
        src: Name of the spoken language.
        tgt: Name of the language to translate into.

    Returns:
        ``(original_text, translated_text, tts_path)``. When no usable file is
        given or nothing is recognized, a warning message takes the place of
        the original text and the other two items are ``""`` and ``None``.
    """
    # The transcription endpoint documents ISO-639-1 codes for ``language``;
    # LANG_CODE holds Tesseract-style codes ("kor", "chi_sim"), so a separate
    # mapping is needed here.
    whisper_lang = {
        "Korean": "ko", "English": "en", "Japanese": "ja", "Chinese": "zh",
        "Thai": "th", "Russian": "ru", "Vietnamese": "vi",
        "Spanish": "es", "French": "fr",
    }

    path = _safe(audio_in)
    if not path or not os.path.exists(path):
        return "โ ๏ธ ์์ฑ ํ์ผ์ ๋ น์ํ๊ฑฐ๋ ์ ๋ก๋ํ์ธ์.", "", None

    with open(path, "rb") as f:
        stt = client.audio.transcriptions.create(
            model="whisper-1",
            file=f,
            # Unknown names fall back to English, as before.
            language=whisper_lang.get(src, "en"),
        )

    origin = stt.text.strip()
    if not origin:
        return "โ ๏ธ ์์ฑ ์ธ์ ์คํจ", "", None

    translated = _gpt_translate(origin, src, tgt)
    tts_path = _tts(translated, tgt)
    return origin, translated, tts_path
# ------------------- 3. Document / image translation -------------------
def _ocr_image(path, lang):
    """OCR one image file: ocrmypdf + pdfplumber first, plain pytesseract as fallback."""
    # De-duplicate so an English source does not yield "eng" twice.
    languages = list(dict.fromkeys([lang, "eng"]))

    # All intermediate PDFs live in one directory that is removed on exit.
    with tempfile.TemporaryDirectory() as workdir:
        src_pdf = os.path.join(workdir, "input.pdf")
        ocr_pdf = os.path.join(workdir, "ocr.pdf")
        try:
            with Image.open(path) as img:
                # Normalize the mode first: Pillow's PDF writer does not
                # accept every image mode.
                img.convert("RGB").save(src_pdf, "PDF")
            # Adds an OCR text layer; can fail, e.g. when the language data
            # is not installed.
            ocrmypdf.ocr(
                src_pdf, ocr_pdf,
                language=languages,
                deskew=True, optimize=0, progress_bar=False,
            )
            with pdfplumber.open(ocr_pdf) as pdf:
                return "\n".join(p.extract_text() or "" for p in pdf.pages)
        except Exception:
            # Deliberate best-effort: any failure in the PDF route falls back
            # to running Tesseract directly on the image.
            with Image.open(path) as img:
                return pytesseract.image_to_string(img, lang=lang)


def translate_doc(file_in, src, tgt, max_pages=5):
    """Extract text from a PDF or an image and translate it.

    Args:
        file_in: Gradio file value (file path, or a dict handled by ``_safe``).
        src: Name of the document language (also selects the OCR language).
        tgt: Name of the language to translate into.
        max_pages: Number of leading PDF pages read for text extraction.

    Returns:
        ``(extracted_text, translated_text)``. On a missing file, an extraction
        error or empty extraction, a message takes the place of the extracted
        text and the translation is ``""``.
    """
    path = _safe(file_in)
    if not path or not os.path.exists(path):
        return "โ ๏ธ PDF ๋๋ ์ด๋ฏธ์ง๋ฅผ ์ ๋ก๋ํ์ธ์.", ""

    ext = os.path.splitext(path)[1].lower()
    mime = mimetypes.guess_type(path)[0] or ""

    try:
        if ext == ".pdf" or "pdf" in mime:
            # (A) PDF: read the embedded text layer directly.
            with pdfplumber.open(path) as pdf:
                text = "\n".join(
                    page.extract_text() or "" for page in pdf.pages[:max_pages]
                )
        else:
            # (B) Image: OCR it.
            text = _ocr_image(path, LANG_CODE.get(src, "eng"))
    except Exception as e:
        # UI boundary: report the failure in the output box instead of raising.
        return f"โ ํ ์คํธ ์ถ์ถ ์ค๋ฅ: {e}", ""

    text = text.strip()
    if not text:
        return "โ ๏ธ ํ ์คํธ๋ฅผ ์ถ์ถํ์ง ๋ชปํ์ต๋๋ค.", ""
    return text, _gpt_translate(text, src, tgt)
# ------------------- 4. Real-time single-language stream -------------------
def stream_one(audio_path, src, tgt, state):
    """Transcribe and translate one streamed audio chunk into one language.

    Args:
        audio_path: Path of the audio chunk file; a missing or non-existent
            path leaves the state unchanged.
        src: Name of the spoken language.
        tgt: Name of the language to translate into.
        state: ``{'orig': str, 'trans': str}`` accumulated so far, or a falsy
            value to start fresh. The dict is updated in place.

    Returns:
        ``(original_so_far, translation_so_far, state)``.
    """
    # The transcription endpoint documents ISO-639-1 codes for ``language``;
    # LANG_CODE holds Tesseract-style codes, so map the names separately.
    whisper_lang = {
        "Korean": "ko", "English": "en", "Japanese": "ja", "Chinese": "zh",
        "Thai": "th", "Russian": "ru", "Vietnamese": "vi",
        "Spanish": "es", "French": "fr",
    }

    state = state or {"orig": "", "trans": ""}
    if not audio_path or not os.path.exists(audio_path):
        return state["orig"], state["trans"], state

    with open(audio_path, "rb") as f:
        stt = client.audio.transcriptions.create(
            model="whisper-1", file=f, language=whisper_lang.get(src, "en")
        )

    # Each call carries only the newest chunk (Gradio streams incrementally),
    # so its transcript is appended rather than diffed against the running
    # text by character offset.
    chunk = stt.text.strip()
    if chunk:
        state["orig"] = f'{state["orig"]} {chunk}'.strip()
        state["trans"] = f'{state["trans"]} {_gpt_translate(chunk, src, tgt)}'.strip()
    return state["orig"], state["trans"].strip(), state
# ------------------- 5. Real-time four-language stream -------------------
def stream_four(audio_path, src, state):
    """Transcribe one streamed audio chunk and translate it into four languages.

    Args:
        audio_path: Path of the audio chunk file; a missing or non-existent
            path leaves the state unchanged.
        src: Name of the spoken language.
        state: Dict with keys ``orig`` / ``English`` / ``Chinese`` / ``Thai`` /
            ``Russian`` accumulated so far, or a falsy value to start fresh.
            The dict is updated in place.

    Returns:
        ``(orig, English, Chinese, Thai, Russian, state)``.
    """
    # The transcription endpoint documents ISO-639-1 codes for ``language``;
    # LANG_CODE holds Tesseract-style codes, so map the names separately.
    whisper_lang = {
        "Korean": "ko", "English": "en", "Japanese": "ja", "Chinese": "zh",
        "Thai": "th", "Russian": "ru", "Vietnamese": "vi",
        "Spanish": "es", "French": "fr",
    }

    state = state or {k: "" for k in ["orig"] + FOUR}
    if not audio_path or not os.path.exists(audio_path):
        return (state["orig"], state["English"], state["Chinese"],
                state["Thai"], state["Russian"], state)

    with open(audio_path, "rb") as f:
        stt = client.audio.transcriptions.create(
            model="whisper-1", file=f, language=whisper_lang.get(src, "en")
        )

    # Each call carries only the newest chunk (Gradio streams incrementally),
    # so its transcript is appended rather than diffed against the running
    # text by character offset.
    chunk = stt.text.strip()
    if chunk:
        state["orig"] = f'{state["orig"]} {chunk}'.strip()
        for tgt in FOUR:
            translated = _gpt_translate(chunk, src, tgt)
            state[tgt] = f'{state.get(tgt, "")} {translated}'.strip()

    return (state["orig"].strip(), state["English"].strip(), state["Chinese"].strip(),
            state["Thai"].strip(), state["Russian"].strip(), state)
# ------------------- 6. Gradio UI -------------------
# Four-tab Gradio interface wiring the UI components to the handlers above.
with gr.Blocks(title="SMARTok Demo", theme=gr.themes.Soft()) as app:
    with gr.Tabs():
        # Tab 1 - audio translation
        with gr.TabItem("๐๏ธ ์ค๋์ค ๋ฒ์ญ"):
            s1 = gr.Dropdown(LANGUAGES, value="Korean", label="์ ๋ ฅ ์ธ์ด")
            t1 = gr.Dropdown(LANGUAGES, value="English", label="์ถ๋ ฅ ์ธ์ด")
            aud1 = gr.Audio(sources=["microphone", "upload"], type="filepath")
            btn1 = gr.Button("๋ฒ์ญ")
            o1 = gr.Textbox(label="์๋ฌธ", lines=5)
            tr1 = gr.Textbox(label="๋ฒ์ญ", lines=5)
            a1 = gr.Audio(label="TTS", type="filepath", autoplay=True)
            btn1.click(translate_audio, [aud1, s1, t1], [o1, tr1, a1])

        # Tab 2 - document / image translation
        with gr.TabItem("๐ ๋ฌธ์ยท์ด๋ฏธ์ง ๋ฒ์ญ"):
            s2 = gr.Dropdown(LANGUAGES, value="Korean", label="์ ๋ ฅ ์ธ์ด")
            t2 = gr.Dropdown(LANGUAGES, value="English", label="์ถ๋ ฅ ์ธ์ด")
            file2 = gr.File(label="PDF / ์ด๋ฏธ์ง ์ ๋ก๋",
                            file_types=[".pdf", ".png", ".jpg", ".jpeg",
                                        ".bmp", ".tiff", ".gif"])
            btn2 = gr.Button("๋ฒ์ญ")
            o2 = gr.Textbox(label="์ถ์ถ ์๋ฌธ", lines=15)
            tr2 = gr.Textbox(label="๋ฒ์ญ ๊ฒฐ๊ณผ", lines=15)
            btn2.click(translate_doc, [file2, s2, t2], [o2, tr2])

        # Tab 3 - real-time single-language translation
        with gr.TabItem("โฑ๏ธ ์ค์๊ฐ 1์ธ์ด"):
            s3 = gr.Dropdown(LANGUAGES, value="Korean", label="์ ๋ ฅ ์ธ์ด")
            t3 = gr.Dropdown(LANGUAGES, value="English", label="์ถ๋ ฅ ์ธ์ด")
            # type="filepath": stream_one opens its first argument as a file.
            mic3 = gr.Audio(sources=["microphone"], streaming=True,
                            type="filepath")
            o3 = gr.Textbox(label="์๋ฌธ(์ค์๊ฐ)", lines=8)
            tr3 = gr.Textbox(label="๋ฒ์ญ(์ค์๊ฐ)", lines=8)
            st3 = gr.State()
            # The microphone must be the first input: stream_one's signature
            # is (audio_path, src, tgt, state).
            # NOTE(review): .stream() also has a time_limit setting whose
            # default may end long sessions - confirm the desired value.
            mic3.stream(stream_one, inputs=[mic3, s3, t3, st3],
                        outputs=[o3, tr3, st3],
                        stream_every=STREAM_SEC)

        # Tab 4 - real-time four-language translation
        with gr.TabItem("๐ ์ค์๊ฐ 4์ธ์ด"):
            s4 = gr.Dropdown(LANGUAGES, value="Korean", label="์ ๋ ฅ ์ธ์ด")
            # type="filepath": stream_four opens its first argument as a file.
            mic4 = gr.Audio(sources=["microphone"], streaming=True,
                            type="filepath")
            o4 = gr.Textbox(label="์๋ฌธ", lines=8)
            e4 = gr.Textbox(label="English", lines=8)
            c4 = gr.Textbox(label="Chinese(็ฎไฝ)", lines=8)
            th4 = gr.Textbox(label="Thai", lines=8)
            r4 = gr.Textbox(label="Russian", lines=8)
            st4 = gr.State()
            # The microphone must be the first input: stream_four's signature
            # is (audio_path, src, state).
            mic4.stream(stream_four, inputs=[mic4, s4, st4],
                        outputs=[o4, e4, c4, th4, r4, st4],
                        stream_every=STREAM_SEC)
# ------------------- 7. Run -------------------
if __name__ == "__main__":
    # Serve on all interfaces at port 7860, without a public share link,
    # with debug mode enabled.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "debug": True,
    }
    app.launch(**launch_options)