# voice-trans / app.py
"""
SMARTok ๋ฐ๋ชจ โ€“ ์ตœ์ข… ์•ˆ์ •ํŒ (2025-06-09)
โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
โ— ํƒญ1 ๐ŸŽ™๏ธ ์˜ค๋””์˜ค ๋ฒˆ์—ญ : ๋งˆ์ดํฌ/ํŒŒ์ผ โ†’ ๋ฒˆ์—ญ + TTS
โ— ํƒญ2 ๐Ÿ“„ ๋ฌธ์„œโ€ง์ด๋ฏธ์ง€ ๋ฒˆ์—ญ : PDF / ์ด๋ฏธ์ง€(OCR) โ†’ ๋ฒˆ์—ญ
โ— ํƒญ3 โฑ๏ธ ์‹ค์‹œ๊ฐ„ 1์–ธ์–ด ๋ฒˆ์—ญ : ๋งˆ์ดํฌ โ†’ ์‹ค์‹œ๊ฐ„ ์ž๋ง‰(1๊ฐœ ์–ธ์–ด)
โ— ํƒญ4 ๐ŸŒ ์‹ค์‹œ๊ฐ„ 4์–ธ์–ด ๋ฒˆ์—ญ : ๋งˆ์ดํฌ โ†’ ์˜ยท์ค‘ยทํƒœยท๋Ÿฌ ๋™์‹œ ์ž๋ง‰
โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
ํ•„์ˆ˜ APT ํŒจํ‚ค์ง€ (packages.txt)
tesseract-ocr
libtesseract-dev
ghostscript
tesseract-ocr-kor tesseract-ocr-eng
tesseract-ocr-rus tesseract-ocr-tha
tesseract-ocr-chi-sim
ffmpeg
ํ•„์ˆ˜ PIP ํŒจํ‚ค์ง€ (requirements.txt)
gradio>=5.33
openai
python-dotenv
pdfplumber
ocrmypdf
pytesseract
pillow
"""
import gradio as gr
import openai, os, io, tempfile, mimetypes
from dotenv import load_dotenv
from PIL import Image
import pdfplumber, ocrmypdf, pytesseract
# ------------------- 0. Initialisation -------------------
load_dotenv()
# Fail fast at import time if the key is missing from the environment/.env.
if not (OPENAI_API_KEY := os.getenv("OPENAI_API_KEY")):
    raise RuntimeError("OPENAI_API_KEY ๊ฐ€ .env ์— ์—†์Šต๋‹ˆ๋‹ค!")
client = openai.OpenAI(api_key=OPENAI_API_KEY)
# UI language choices (STT source / translation targets).
LANGUAGES = [
    "Korean", "English", "Japanese", "Chinese",
    "Thai", "Russian", "Vietnamese",
    "Spanish", "French",
]
# Tesseract language-pack code for each UI language name.
LANG_CODE = {
    "Korean": "kor", "English": "eng", "Japanese": "jpn", "Chinese": "chi_sim",
    "Thai": "tha", "Russian": "rus", "Vietnamese": "vie",
    "Spanish": "spa", "French": "fra",
}
# TTS voice per language: "nova" for Korean/Japanese/Chinese, "alloy" otherwise.
_NOVA_LANGS = {"Korean", "Japanese", "Chinese"}
VOICE = {lang: ("nova" if lang in _NOVA_LANGS else "alloy") for lang in LANGUAGES}
FOUR = ["English", "Chinese", "Thai", "Russian"]  # simultaneous-translation targets
STREAM_SEC = 4  # real-time chunk length (seconds)
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ 1. ๊ณตํ†ต ํ•จ์ˆ˜ โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
def _safe(val):
"""Gradio File/Audio โ†’ ๊ฒฝ๋กœ"""
if val is None:
return None
return val["name"] if isinstance(val, dict) else val
def _gpt_translate(text: str, src: str, tgt: str) -> str:
    """Translate *text* from *src* to *tgt* via GPT-3.5; return only the translation."""
    instruction = (
        f"Translate the following {src} text to {tgt}. "
        "Return only the translated text."
    )
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": instruction},
            {"role": "user", "content": text},
        ],
        temperature=0.3,      # low temperature for faithful translation
        max_tokens=4096,
    )
    return response.choices[0].message.content.strip()
def _tts(text: str, lang: str) -> str:
    """Synthesise *text* to an mp3 with OpenAI TTS-1 and return the temp-file path."""
    speech = client.audio.speech.create(
        model="tts-1",
        voice=VOICE.get(lang, "alloy"),
        input=text[:4096],  # truncate to the TTS input limit
    )
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
        tmp.write(speech.content)
        return tmp.name
# ------------------- 2. One-shot audio translation -------------------
def translate_audio(audio_in, src, tgt):
    """Whisper STT -> GPT translation -> TTS for a recorded/uploaded clip.

    Returns (original_text, translated_text, tts_mp3_path); on failure the
    first element carries a user-facing warning and the audio path is None.
    """
    path = _safe(audio_in)
    if not path or not os.path.exists(path):
        return "โš ๏ธ ์Œ์„ฑ ํŒŒ์ผ์„ ๋…น์Œํ•˜๊ฑฐ๋‚˜ ์—…๋กœ๋“œํ•˜์„ธ์š”.", "", None
    # FIX: whisper-1 expects ISO-639-1 codes ("ko"); LANG_CODE holds tesseract
    # codes ("kor", "chi_sim") which the transcription API rejects.
    iso639 = {"Korean": "ko", "English": "en", "Japanese": "ja", "Chinese": "zh",
              "Thai": "th", "Russian": "ru", "Vietnamese": "vi",
              "Spanish": "es", "French": "fr"}
    with open(path, "rb") as f:
        stt = client.audio.transcriptions.create(
            model="whisper-1", file=f, language=iso639.get(src, "en")
        )
    origin = stt.text.strip()
    if not origin:
        return "โš ๏ธ ์Œ์„ฑ ์ธ์‹ ์‹คํŒจ", "", None
    translated = _gpt_translate(origin, src, tgt)
    tts_path = _tts(translated, tgt)
    return origin, translated, tts_path
# ------------------- 3. Document / image translation -------------------
def translate_doc(file_in, src, tgt):
    """Extract text from a PDF or image (via OCR) and translate it.

    Returns (extracted_text, translated_text); on failure the first element
    carries a user-facing warning and the second is "".
    """
    path = _safe(file_in)
    if not path or not os.path.exists(path):
        return "โš ๏ธ PDF ๋˜๋Š” ์ด๋ฏธ์ง€๋ฅผ ์—…๋กœ๋“œํ•˜์„ธ์š”.", ""
    ext = os.path.splitext(path)[1].lower()
    mime = mimetypes.guess_type(path)[0] or ""
    text = ""
    try:
        # (A) PDF: direct text extraction (first 5 pages to bound cost)
        if ext == ".pdf" or "pdf" in mime:
            with pdfplumber.open(path) as pdf:
                text = "\n".join(page.extract_text() or "" for page in pdf.pages[:5])
        # (B) image: rasterise to a one-page PDF, OCR it, then extract
        else:
            tmp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf").name
            with Image.open(path) as img:
                # FIX: PDF cannot encode alpha/palette modes (RGBA, P, LA) --
                # saving such images raised OSError. Flatten to RGB first.
                img.convert("RGB").save(tmp_pdf, "PDF")
            ocr_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf").name
            try:
                # Add an OCR text layer (may fail if language data is missing)
                ocrmypdf.ocr(
                    tmp_pdf, ocr_pdf,
                    lang=f"{LANG_CODE.get(src, 'eng')}+eng",
                    deskew=True, optimize=0, progress_bar=False
                )
                with pdfplumber.open(ocr_pdf) as pdf:
                    text = "\n".join(p.extract_text() or "" for p in pdf.pages)
            except Exception:
                # ocrmypdf failed -> fall back to plain pytesseract on the image
                with Image.open(path) as img:
                    text = pytesseract.image_to_string(
                        img, lang=LANG_CODE.get(src, "eng")
                    )
    except Exception as e:
        return f"โŒ ํ…์ŠคํŠธ ์ถ”์ถœ ์˜ค๋ฅ˜: {e}", ""
    text = text.strip()
    if not text:
        return "โš ๏ธ ํ…์ŠคํŠธ๋ฅผ ์ถ”์ถœํ•˜์ง€ ๋ชปํ–ˆ์Šต๋‹ˆ๋‹ค.", ""
    return text, _gpt_translate(text, src, tgt)
# ------------------- 4. Real-time one-language stream -------------------
def stream_one(audio_path, src, tgt, state):
    """Incremental one-language subtitles for streamed microphone audio.

    state = {'orig': str, 'trans': str}. Re-transcribes the cumulative audio,
    translates only the untranslated tail, and returns
    (original_so_far, translation_so_far, state).
    """
    state = state or {"orig": "", "trans": ""}
    if not audio_path or not os.path.exists(audio_path):
        return state["orig"], state["trans"], state
    # FIX: whisper-1 expects ISO-639-1 codes ("ko"); LANG_CODE holds tesseract
    # codes ("kor", "chi_sim") which the transcription API rejects.
    iso639 = {"Korean": "ko", "English": "en", "Japanese": "ja", "Chinese": "zh",
              "Thai": "th", "Russian": "ru", "Vietnamese": "vi",
              "Spanish": "es", "French": "fr"}
    with open(audio_path, "rb") as f:
        stt = client.audio.transcriptions.create(
            model="whisper-1", file=f, language=iso639.get(src, "en")
        )
    full = stt.text.strip()
    # Tail not yet translated; assumes the transcript only grows as a prefix.
    new = full[len(state["orig"]):]
    if new:
        state["orig"] = full
        state["trans"] += " " + _gpt_translate(new, src, tgt)
    return state["orig"], state["trans"].strip(), state
# ------------------- 5. Real-time four-language stream -------------------
def stream_four(audio_path, src, state):
    """Incremental four-language subtitles (English/Chinese/Thai/Russian).

    state keys: orig / English / Chinese / Thai / Russian. Returns the five
    accumulated texts followed by the updated state.
    """
    state = state or {k: "" for k in ["orig"] + FOUR}
    if not audio_path or not os.path.exists(audio_path):
        return (state["orig"], state["English"], state["Chinese"],
                state["Thai"], state["Russian"], state)
    # FIX: whisper-1 expects ISO-639-1 codes ("ko"); LANG_CODE holds tesseract
    # codes ("kor", "chi_sim") which the transcription API rejects.
    iso639 = {"Korean": "ko", "English": "en", "Japanese": "ja", "Chinese": "zh",
              "Thai": "th", "Russian": "ru", "Vietnamese": "vi",
              "Spanish": "es", "French": "fr"}
    with open(audio_path, "rb") as f:
        stt = client.audio.transcriptions.create(
            model="whisper-1", file=f, language=iso639.get(src, "en")
        )
    full = stt.text.strip()
    # Tail not yet translated; assumes the transcript only grows as a prefix.
    new = full[len(state["orig"]):]
    if new:
        state["orig"] = full
        for tgt in FOUR:
            state[tgt] += " " + _gpt_translate(new, src, tgt)
    return (state["orig"].strip(), state["English"].strip(), state["Chinese"].strip(),
            state["Thai"].strip(), state["Russian"].strip(), state)
# ------------------- 6. Gradio UI -------------------
with gr.Blocks(title="SMARTok Demo", theme=gr.themes.Soft()) as app:
    with gr.Tabs():
        # Tab 1 - one-shot audio translation
        with gr.TabItem("๐ŸŽ™๏ธ ์˜ค๋””์˜ค ๋ฒˆ์—ญ"):
            s1 = gr.Dropdown(LANGUAGES, value="Korean", label="์ž…๋ ฅ ์–ธ์–ด")
            t1 = gr.Dropdown(LANGUAGES, value="English", label="์ถœ๋ ฅ ์–ธ์–ด")
            aud1 = gr.Audio(sources=["microphone", "upload"], type="filepath")
            btn1 = gr.Button("๋ฒˆ์—ญ")
            o1 = gr.Textbox(label="์›๋ฌธ", lines=5)
            tr1 = gr.Textbox(label="๋ฒˆ์—ญ", lines=5)
            a1 = gr.Audio(label="TTS", type="filepath", autoplay=True)
            btn1.click(translate_audio, [aud1, s1, t1], [o1, tr1, a1])
        # Tab 2 - document / image translation
        with gr.TabItem("๐Ÿ“„ ๋ฌธ์„œยท์ด๋ฏธ์ง€ ๋ฒˆ์—ญ"):
            s2 = gr.Dropdown(LANGUAGES, value="Korean", label="์ž…๋ ฅ ์–ธ์–ด")
            t2 = gr.Dropdown(LANGUAGES, value="English", label="์ถœ๋ ฅ ์–ธ์–ด")
            file2 = gr.File(label="PDF / ์ด๋ฏธ์ง€ ์—…๋กœ๋“œ",
                            file_types=[".pdf", ".png", ".jpg", ".jpeg",
                                        ".bmp", ".tiff", ".gif"])
            btn2 = gr.Button("๋ฒˆ์—ญ")
            o2 = gr.Textbox(label="์ถ”์ถœ ์›๋ฌธ", lines=15)
            tr2 = gr.Textbox(label="๋ฒˆ์—ญ ๊ฒฐ๊ณผ", lines=15)
            btn2.click(translate_doc, [file2, s2, t2], [o2, tr2])
        # Tab 3 - real-time one-language translation
        with gr.TabItem("โฑ๏ธ ์‹ค์‹œ๊ฐ„ 1์–ธ์–ด"):
            s3 = gr.Dropdown(LANGUAGES, value="Korean", label="์ž…๋ ฅ ์–ธ์–ด")
            t3 = gr.Dropdown(LANGUAGES, value="English", label="์ถœ๋ ฅ ์–ธ์–ด")
            mic3 = gr.Audio(sources=["microphone"], streaming=True)
            o3 = gr.Textbox(label="์›๋ฌธ(์‹ค์‹œ๊ฐ„)", lines=8)
            tr3 = gr.Textbox(label="๋ฒˆ์—ญ(์‹ค์‹œ๊ฐ„)", lines=8)
            st3 = gr.State()
            # FIX: the streaming component itself must be in `inputs`, otherwise
            # stream_one(audio_path, src, tgt, state) is called with 3 args.
            mic3.stream(stream_one, inputs=[mic3, s3, t3, st3],
                        outputs=[o3, tr3, st3])
        # Tab 4 - real-time four-language translation
        with gr.TabItem("๐ŸŒ ์‹ค์‹œ๊ฐ„ 4์–ธ์–ด"):
            s4 = gr.Dropdown(LANGUAGES, value="Korean", label="์ž…๋ ฅ ์–ธ์–ด")
            mic4 = gr.Audio(sources=["microphone"], streaming=True)
            o4 = gr.Textbox(label="์›๋ฌธ", lines=8)
            e4 = gr.Textbox(label="English", lines=8)
            c4 = gr.Textbox(label="Chinese(็ฎ€ไฝ“)", lines=8)
            th4 = gr.Textbox(label="Thai", lines=8)
            r4 = gr.Textbox(label="Russian", lines=8)
            st4 = gr.State()
            # FIX: include mic4 so stream_four receives the audio path.
            mic4.stream(stream_four, inputs=[mic4, s4, st4],
                        outputs=[o4, e4, c4, th4, r4, st4])
# ------------------- 7. Entry point -------------------
if __name__ == "__main__":
    # Bind to all interfaces so the Space/container is reachable externally.
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        debug=True,
    )