Spaces:
Running
on
Zero
Running
on
Zero
File size: 9,517 Bytes
effad1c 6b6f26e effad1c b3067c5 6b6f26e 6bdc489 6b6f26e b3067c5 6b6f26e 6bdc489 a609646 6bdc489 32b3c75 a609646 32b3c75 5897b48 392a5eb 5897b48 effad1c fd022eb 6b6f26e fd022eb 6b6f26e effad1c 6b6f26e 392a5eb 6b6f26e a609646 392a5eb 6b6f26e 392a5eb a609646 effad1c 5897b48 a609646 6b6f26e 5897b48 392a5eb 5897b48 6b6f26e a609646 6b6f26e fd022eb 6b6f26e a609646 6b6f26e a609646 6b6f26e fd022eb 6b6f26e fd022eb 6b6f26e fd022eb 6b6f26e 7cce69a 6b6f26e 392a5eb 6b6f26e 392a5eb 6b6f26e 392a5eb 6b6f26e b3067c5 6b6f26e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 |
"""
SMARTok ์ค์๊ฐ ๋ค๊ตญ์ด ๋ฐ๋ชจ (์์ ์์ ๋ณธ)
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
โข ํญ1 ๐๏ธ ์ค๋์ค ๋ฒ์ญ : ๋ง์ดํฌ/ํ์ผ โ ๋ฒ์ญ + TTS
โข ํญ2 ๐ ๋ฌธ์ยท์ด๋ฏธ์ง ๋ฒ์ญ : PDF / ์ด๋ฏธ์ง(OCR) โ ๋ฒ์ญ
โข ํญ3 โฑ๏ธ ์ค์๊ฐ 1์ธ์ด ๋ฒ์ญ : ๋ง์ดํฌ โ 1๊ฐ ์ธ์ด ์ค์๊ฐ ์๋ง
โข ํญ4 ๐ ์ค์๊ฐ 4๊ฐ ์ธ์ด ๋ฒ์ญ : ๋ง์ดํฌ โ ์ยท์คยทํยท๋ฌ ๋์ ์๋ง
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
ํ์ apt : tesseract-ocr libtesseract-dev ocrmypdf ffmpeg
ํ์ pip : gradio>=5.33 openai python-dotenv pdfplumber ocrmypdf pillow
"""
import gradio as gr
import openai, os, io, tempfile, mimetypes, json, uuid
from dotenv import load_dotenv
import pdfplumber, ocrmypdf
from PIL import Image
# ────────────── 0. Common initialization ──────────────
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
raise RuntimeError("OPENAI_API_KEY๋ฅผ .env ํ์ผ์ ์ค์ ํ์ธ์!")
client = openai.OpenAI(api_key=api_key)
# Language names offered in the UI dropdowns, in display order.
LANGUAGES = [
    "Korean",
    "English",
    "Japanese",
    "Chinese",
    "Thai",
    "Russian",
    "Vietnamese",
    "Spanish",
    "French",
]

# Two-letter language code for each display name (used as the
# transcription `language` hint).
LANG_CODE = dict(
    zip(LANGUAGES, ("ko", "en", "ja", "zh", "th", "ru", "vi", "es", "fr"))
)

# TTS voice per language: "nova" for Korean/Japanese/Chinese, "alloy" for
# everything else.  Key order follows LANGUAGES.
VOICE = {
    **dict.fromkeys(LANGUAGES, "alloy"),
    **dict.fromkeys(("Korean", "Japanese", "Chinese"), "nova"),
}

# Target languages of the four-language real-time tab.
FOUR = ["English", "Chinese", "Thai", "Russian"]

# Interval between Whisper calls (per the original comment; unit presumably
# seconds — confirm).
STREAM_SEC = 4
# ────────────── 1. Utility functions ──────────────
def _safe(v):
if v is None:
return None
return v["name"] if isinstance(v, dict) else v
def _gpt(text, src, tgt):
rsp = client.chat.completions.create(
model="gpt-3.5-turbo",
messages=[
{"role":"system",
"content":f"Translate the following {src} text to {tgt}. "
"Return only the translation."},
{"role":"user","content":text}
],
temperature=0.3,max_tokens=4096
)
return rsp.choices[0].message.content.strip()
def _tts(text, lang):
rsp = client.audio.speech.create(
model="tts-1",
voice=VOICE.get(lang,"alloy"),
input=text[:4096]
)
tmp = tempfile.NamedTemporaryFile(delete=False,suffix=".mp3")
tmp.write(rsp.content); tmp.close()
return tmp.name
# ────────────── 2. Audio (single-shot) translation ──────────────
def translate_audio(audio_in, src, tgt):
    """Transcribe an audio file, translate the transcript and voice the result.

    Args:
        audio_in: Audio payload from the UI (normalized with ``_safe``).
        src: Source language name; looked up in ``LANG_CODE`` for the
            transcription language hint.
        tgt: Target language name.

    Returns:
        ``(original_text, translated_text, tts_path)``.  When the input file
        is missing or the transcript is empty, returns
        ``(warning_message, "", None)`` instead.
    """
    path = _safe(audio_in)
    if not path or not os.path.exists(path):
        # "Audio file required"
        return "⚠️ 음성 파일 필요", "", None

    # Only send a language hint when the source language is mapped; passing
    # ``language=None`` is not a documented value.
    stt_kwargs = {}
    code = LANG_CODE.get(src)
    if code:
        stt_kwargs["language"] = code

    with open(path, "rb") as f:
        stt = client.audio.transcriptions.create(
            model="whisper-1", file=f, **stt_kwargs)

    orig = stt.text.strip()
    if not orig:
        # "Speech recognition failed"
        return "⚠️ 음성 인식 실패", "", None

    trans = _gpt(orig, src, tgt)
    return orig, trans, _tts(trans, tgt)
# ────────────── 3. Document / image translation ──────────────
# Tesseract traineddata names per UI language.  OCR needs these, not the
# two-letter codes in LANG_CODE.  The matching language packs must be
# installed on the host.  "chi_sim" matches the UI's "Chinese(简体)" label.
_TESS_LANG = {
    "Korean": "kor", "English": "eng", "Japanese": "jpn", "Chinese": "chi_sim",
    "Thai": "tha", "Russian": "rus", "Vietnamese": "vie",
    "Spanish": "spa", "French": "fra",
}

# Native PDFs are only read up to this many pages (the original hard-coded 5).
_MAX_PDF_PAGES = 5

# Image modes written to PDF as-is; anything else (e.g. alpha modes) is
# converted to RGB first because some Pillow versions refuse to save them.
_PDF_SAFE_MODES = ("1", "L", "P", "RGB", "CMYK")


def _extract_pdf_text(pdf_path, max_pages=None):
    """Return the newline-joined text of the first ``max_pages`` pages (all if None)."""
    with pdfplumber.open(pdf_path) as pdf:
        pages = pdf.pages if max_pages is None else pdf.pages[:max_pages]
        return "\n".join(pg.extract_text() or "" for pg in pages)


def _ocr_image_text(image_path, src):
    """OCR an image by wrapping it in a PDF, running OCRmyPDF, and reading the text layer."""
    # All intermediates live in one directory that is removed on exit, so no
    # temporary PDFs are left behind.
    with tempfile.TemporaryDirectory() as workdir:
        img_pdf = os.path.join(workdir, "image.pdf")
        ocr_pdf = os.path.join(workdir, "ocr.pdf")
        with Image.open(image_path) as img:
            page = img if img.mode in _PDF_SAFE_MODES else img.convert("RGB")
            page.save(img_pdf, "PDF")
        ocrmypdf.ocr(
            img_pdf,
            ocr_pdf,
            language=[_TESS_LANG.get(src, "eng")],
            deskew=True,
            optimize=0,
            progress_bar=False,
        )
        return _extract_pdf_text(ocr_pdf)


def translate_doc(file_in, src, tgt):
    """Extract text from a PDF or image and translate it.

    Args:
        file_in: File payload from the UI (normalized with ``_safe``).
        src: Source language name; also selects the OCR language for images.
        tgt: Target language name.

    Returns:
        ``(extracted_text, translated_text)``.  When the file is missing,
        extraction raises, or no text is found, returns
        ``(message, "")`` instead.
    """
    path = _safe(file_in)
    if not path or not os.path.exists(path):
        # "Upload a PDF/image"
        return "⚠️ PDF/이미지 업로드", ""

    ext = os.path.splitext(path)[1].lower()
    mime = mimetypes.guess_type(path)[0] or ""

    try:
        if ext == ".pdf" or "pdf" in mime:
            # Native PDF: read its text layer directly.
            txt = _extract_pdf_text(path, _MAX_PDF_PAGES)
        else:
            # Image: OCR it into a searchable PDF, then read that.
            txt = _ocr_image_text(path, src)
    except Exception as e:
        # UI boundary: report any extraction failure to the user as text.
        # "Extraction error: ..."
        return f"❌ 추출 오류: {e}", ""

    txt = txt.strip()
    if not txt:
        # "Text extraction failed"
        return "⚠️ 텍스트 추출 실패", ""
    return txt, _gpt(txt, src, tgt)
# ────────────── 4. Real-time, one language ──────────────
def stream_single(audio_path, src, tgt, state):
    """Transcribe the latest streamed audio and extend the running translation.

    Args:
        audio_path: Path of the audio file for this stream tick.
        src: Source language name; looked up in ``LANG_CODE``.
        tgt: Target language name.
        state: Dict with ``"orig"`` and ``"trans"`` accumulators, or a falsy
            value on the first call.

    Returns:
        ``(original_text, translated_text, state)``.  When ``audio_path`` is
        empty or does not exist, the accumulated values are returned
        unchanged.
    """
    state = state or {"orig": "", "trans": ""}
    if not audio_path or not os.path.exists(audio_path):
        return state["orig"], state["trans"], state

    with open(audio_path, "rb") as f:
        stt = client.audio.transcriptions.create(
            model="whisper-1", file=f, language=LANG_CODE.get(src))
    full = stt.text.strip()

    prev = state["orig"]
    if full.startswith(prev):
        # Transcript extends what we already have: translate only the tail.
        new, merged = full[len(prev):].strip(), full
    else:
        # Transcript is not a continuation (e.g. it covers only the newest
        # chunk).  Slicing by the old length would cut it at an arbitrary
        # point, so treat it as new text and append it.
        new, merged = full, f"{prev} {full}"

    if new:
        state["orig"] = merged
        state["trans"] = f'{state["trans"]} {_gpt(new, src, tgt)}'.strip()
    return state["orig"], state["trans"], state
# ────────────── 5. Real-time, four languages ──────────────
def stream_multi(audio_path, src, state):
    """Transcribe the latest streamed audio and extend all ``FOUR`` translations.

    Args:
        audio_path: Path of the audio file for this stream tick.
        src: Source language name; looked up in ``LANG_CODE``.
        state: Dict with an ``"orig"`` accumulator plus one accumulator per
            language in ``FOUR``, or a falsy value on the first call.

    Returns:
        ``(original_text, <one translation per FOUR entry, in order>, state)``.
        When ``audio_path`` is empty or does not exist, the accumulated
        values are returned without calling any API.
    """
    state = state or {k: "" for k in ["orig"] + FOUR}

    def _outputs():
        # Order matches the UI wiring: original text, one box per FOUR
        # language, then the state object.
        return (
            state["orig"].strip(),
            *(state[lang].strip() for lang in FOUR),
            state,
        )

    if not audio_path or not os.path.exists(audio_path):
        return _outputs()

    with open(audio_path, "rb") as f:
        stt = client.audio.transcriptions.create(
            model="whisper-1", file=f, language=LANG_CODE.get(src))
    full = stt.text.strip()

    prev = state["orig"]
    if full.startswith(prev):
        # Transcript extends what we already have: translate only the tail.
        new, merged = full[len(prev):].strip(), full
    else:
        # Transcript is not a continuation (e.g. it covers only the newest
        # chunk).  Slicing by the old length would cut it at an arbitrary
        # point, so treat it as new text and append it.
        new, merged = full, f"{prev} {full}"

    if new:
        state["orig"] = merged
        for lang in FOUR:
            state[lang] = f"{state[lang]} {_gpt(new, src, lang)}".strip()
    return _outputs()
# ────────────── 6. Gradio UI ──────────────
# Four-tab UI.  User-facing labels are Korean (restored from mis-encoded
# text); English glosses are given in the comments.
with gr.Blocks(title="SMARTok Demo", theme=gr.themes.Soft()) as app:
    with gr.Tabs():
        # Tab 1 — "Audio translation": microphone or upload → text + TTS.
        with gr.TabItem("🎙️ 오디오 번역"):
            src1 = gr.Dropdown(LANGUAGES, value="Korean", label="입력 언어")    # input language
            tgt1 = gr.Dropdown(LANGUAGES, value="English", label="출력 언어")   # output language
            aud1 = gr.Audio(sources=["microphone", "upload"], type="filepath")
            res1 = gr.Button("번역")                                            # translate
            o1 = gr.Textbox(label="원문", lines=5)                              # original text
            t1 = gr.Textbox(label="번역", lines=5)                              # translation
            a1 = gr.Audio(label="TTS", type="filepath", autoplay=True)
            res1.click(translate_audio, [aud1, src1, tgt1], [o1, t1, a1])

        # Tab 2 — "Document / image translation": PDF or image (OCR) → text.
        with gr.TabItem("📄 문서·이미지 번역"):
            src2 = gr.Dropdown(LANGUAGES, value="Korean", label="입력 언어")
            tgt2 = gr.Dropdown(LANGUAGES, value="English", label="출력 언어")
            file2 = gr.File(
                label="PDF/이미지 업로드",                                       # upload PDF/image
                file_types=[".pdf", ".png", ".jpg", ".jpeg",
                            ".bmp", ".tiff", ".gif"],
            )
            doc2 = gr.Button("번역")
            o2 = gr.Textbox(label="추출 원문", lines=15)                        # extracted text
            t2 = gr.Textbox(label="번역 결과", lines=15)                        # translation result
            doc2.click(translate_doc, [file2, src2, tgt2], [o2, t2])

        # Tab 3 — "Real-time, one language": live microphone subtitles.
        with gr.TabItem("⏱️ 실시간 1언어"):
            src3 = gr.Dropdown(LANGUAGES, value="Korean", label="입력 언어")
            tgt3 = gr.Dropdown(LANGUAGES, value="English", label="출력 언어")
            # type="filepath": stream_single opens the audio from a path.
            mic3 = gr.Audio(sources=["microphone"], streaming=True,
                            type="filepath")
            o3 = gr.Textbox(label="원문(실시간)", lines=8)                      # original (live)
            t3 = gr.Textbox(label="번역(실시간)", lines=8)                      # translation (live)
            st3 = gr.State()
            # The microphone must be the first input: stream_single's
            # signature is (audio_path, src, tgt, state).
            mic3.stream(stream_single,
                        inputs=[mic3, src3, tgt3, st3],
                        outputs=[o3, t3, st3],
                        stream_every=STREAM_SEC)

        # Tab 4 — "Real-time, four languages": simultaneous subtitles.
        with gr.TabItem("🌏 실시간 4개 언어"):
            src4 = gr.Dropdown(LANGUAGES, value="Korean", label="입력 언어")
            mic4 = gr.Audio(sources=["microphone"], streaming=True,
                            type="filepath")
            o4 = gr.Textbox(label="원문", lines=8)
            e4 = gr.Textbox(label="English", lines=8)
            c4 = gr.Textbox(label="Chinese(简体)", lines=8)
            th4 = gr.Textbox(label="Thai", lines=8)
            r4 = gr.Textbox(label="Russian", lines=8)
            st4 = gr.State()
            # stream_multi's signature is (audio_path, src, state).
            mic4.stream(stream_multi,
                        inputs=[mic4, src4, st4],
                        outputs=[o4, e4, c4, th4, r4, st4],
                        stream_every=STREAM_SEC)
# ────────────── 7. Run ──────────────
if __name__ == "__main__":
    # Start the Gradio server only when this file is executed as a script:
    # host 0.0.0.0, port 7860, no public share link, debug mode on.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "debug": True,
    }
    app.launch(**launch_options)
|