Spaces: Running on Zero

Update app.py
Browse files

app.py CHANGED
@@ -2,69 +2,62 @@ import gradio as gr
 import openai, os, io, tempfile, mimetypes
 from dotenv import load_dotenv
 
+# ────────────────────────── 공통 초기화 ──────────────────────────
 load_dotenv()
 api_key = os.getenv("OPENAI_API_KEY")
 if not api_key:
     raise RuntimeError("OPENAI_API_KEY를 .env 파일에 설정하세요!")
 client = openai.OpenAI(api_key=api_key)
 
 LANGUAGES = [
     "Korean", "English", "Japanese", "Chinese",
     "Thai", "Russian", "Vietnamese",
     "Spanish", "French"
 ]
 LANG_CODE = {
+    "Korean":"ko","English":"en","Japanese":"ja","Chinese":"zh",
+    "Thai":"th","Russian":"ru","Vietnamese":"vi",
+    "Spanish":"es","French":"fr"
 }
+FOUR_LANGS = ["English", "Chinese", "Thai", "Russian"]   # 실시간 동시 번역 언어
+VOICE = {l:("nova" if l in ["Korean","Japanese","Chinese"] else "alloy")
         for l in LANGUAGES}
 
+# ────────────────────────── 유틸 함수 ───────────────────────────
+def _safe_path(v):
+    if v is None: return None
+    return v.get("name") if isinstance(v, dict) else v
+
+def _gpt_translate(text, src, tgt):
     rsp = client.chat.completions.create(
         model="gpt-3.5-turbo",
         messages=[
+            {"role":"system",
+             "content":f"You are a professional translator. Translate the following {src} text to {tgt}. "
                        f"Only provide the translated text."},
+            {"role":"user","content":text}
         ],
+        temperature=0.3, max_tokens=2048
     )
     return rsp.choices[0].message.content.strip()
 
+def _tts(text, lang):
     out = client.audio.speech.create(
         model="tts-1",
+        voice=VOICE.get(lang,"alloy"),
         input=text[:4096]
     )
     tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
+    tmp.write(out.content); tmp.close()
     return tmp.name
 
+# ────────────────── ① 음성(Mic·File) 단건 번역 ──────────────────
 def translate_audio(audio_in, src, tgt):
     path = _safe_path(audio_in)
     if not path or not os.path.exists(path):
+        return "⚠️ 음성 파일을 녹음/업로드하세요.", "", None
 
+    with open(path,"rb") as f:
         stt = client.audio.transcriptions.create(
             model="whisper-1",
             file=f,
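A quick way to exercise the two helpers added above outside the Gradio UI (illustrative only; the sample sentence and variable names below are made up for this note, not taken from the commit):

# Illustrative only: assumes app.py has been imported and OPENAI_API_KEY is set,
# so client, _gpt_translate and _tts are available.
translated = _gpt_translate("안녕하세요, 만나서 반갑습니다.", "Korean", "English")
mp3_path   = _tts(translated, "English")   # VOICE maps English to the "alloy" voice
print(translated, mp3_path)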
@@ -75,18 +68,18 @@ def translate_audio(audio_in, src, tgt):
         return "⚠️ 음성 인식 실패", "", None
 
     translated = _gpt_translate(original, src, tgt)
+    tts_path   = _tts(translated, tgt)
     return original, translated, tts_path
 
+# ────────────────── ② PDF / 이미지 번역 ─────────────────────────
 def translate_document(file_in, src, tgt):
     path = _safe_path(file_in)
     if not path or not os.path.exists(path):
         return "⚠️ PDF 또는 이미지를 업로드하세요.", ""
 
     ext = os.path.splitext(path)[1].lower()
     mime = mimetypes.guess_type(path)[0] or ""
+    text = ""
 
     try:
         if ext == ".pdf" or "pdf" in mime:
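Because _safe_path() passes plain strings through unchanged, translate_audio() can also be smoke-tested directly with a local file (illustrative; "sample.wav" is a placeholder, not a file in the repo):

# Illustrative only: returns (transcript, translation, path_to_tts_mp3).
orig_text, translated_text, tts_mp3 = translate_audio("sample.wav", "Korean", "English")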
@@ -94,14 +87,14 @@ def translate_document(file_in, src, tgt):
             with pdfplumber.open(path) as pdf:
                 pages = pdf.pages[:5]          # 데모: 최대 5쪽
                 text = "\n".join(p.extract_text() or "" for p in pages)
+        elif ext in [".png",".jpg",".jpeg",".bmp",".tiff",".gif"] or "image" in mime:
             from PIL import Image
             import pytesseract
             text = pytesseract.image_to_string(Image.open(path))
         else:
             return "⚠️ 지원하지 않는 파일 형식입니다.", ""
     except Exception as e:
+        return f"❌ 텍스트 추출 실패: {e}", ""
 
     text = text.strip()
     if not text:
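Note that pdfplumber and pytesseract are imported lazily inside this function, and pytesseract additionally needs the Tesseract OCR binary available on the Space (on Hugging Face typically installed via packages.txt). A direct call for local testing might look like this (illustrative; "sample.pdf" is a placeholder path):

# Illustrative only: returns (extracted_text, translated_text).
extracted, translated = translate_document("sample.pdf", "Korean", "English")
print(extracted[:300])
print(translated[:300])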
@@ -110,145 +103,146 @@ def translate_document(file_in, src, tgt):
     translated = _gpt_translate(text, src, tgt)
     return text, translated
 
+# ──────────────── ③ 실시간 1개 언어 번역 (옵션) ─────────────────
+STREAM_SEC = 4      # Whisper 호출 주기
 
+def stream_single(mic_stream, src, tgt):
+    buf, header = io.BytesIO(), None
+    o_acc, t_acc = "", ""
     while True:
         chunk = mic_stream.recv()
+        if chunk is None: break
+        if header is None: header = chunk[:44]
+        buf.write(chunk)
+        if buf.getbuffer().nbytes > 16000*2*STREAM_SEC:
+            wav = header + buf.getvalue()
+            with tempfile.NamedTemporaryFile(delete=False,suffix=".wav") as tmp:
+                tmp.write(wav); tmp.close()
             o, t, _ = translate_audio(tmp.name, src, tgt)
+            o_acc += " " + o; t_acc += " " + t
+            yield o_acc.strip(), t_acc.strip()
+            buf = io.BytesIO()
+    if buf.getbuffer().nbytes:
+        wav = header + buf.getvalue()
+        with tempfile.NamedTemporaryFile(delete=False,suffix=".wav") as tmp:
+            tmp.write(wav); tmp.close()
         o, t, _ = translate_audio(tmp.name, src, tgt)
+        yield (o_acc+" "+o).strip(), (t_acc+" "+t).strip()
 
+# ─────────────── ④ 실시간 4개 언어 동시 번역 ────────────────────
+def stream_multi(mic_stream, src):
+    buf, header = io.BytesIO(), None
+    acc = {lang: "" for lang in ["original"] + FOUR_LANGS}
 
+    while True:
+        chunk = mic_stream.recv()
+        if chunk is None: break
+        if header is None: header = chunk[:44]
+        buf.write(chunk)
+
+        if buf.getbuffer().nbytes > 16000*2*STREAM_SEC:
+            wav = header + buf.getvalue()
+            with tempfile.NamedTemporaryFile(delete=False,suffix=".wav") as tmp:
+                tmp.write(wav); tmp.close()
+            with open(tmp.name,"rb") as f:
+                stt = client.audio.transcriptions.create(
+                    model="whisper-1", file=f,
+                    language=LANG_CODE.get(src)
+                )
+            orig = stt.text.strip()
+            if orig:
+                acc["original"] += " " + orig
+                for lang in FOUR_LANGS:
+                    acc[lang] += " " + _gpt_translate(orig, src, lang)
+            yield (acc["original"].strip(),
+                   acc["English"].strip(),
+                   acc["Chinese"].strip(),
+                   acc["Thai"].strip(),
+                   acc["Russian"].strip())
+            buf = io.BytesIO()
+
+    # 남은 버퍼
+    if buf.getbuffer().nbytes:
+        wav = header + buf.getvalue()
+        with tempfile.NamedTemporaryFile(delete=False,suffix=".wav") as tmp:
+            tmp.write(wav); tmp.close()
+        with open(tmp.name,"rb") as f:
+            stt = client.audio.transcriptions.create(
+                model="whisper-1", file=f,
+                language=LANG_CODE.get(src)
+            )
+        orig = stt.text.strip()
+        if orig:
+            acc["original"] += " " + orig
+            for lang in FOUR_LANGS:
+                acc[lang] += " " + _gpt_translate(orig, src, lang)
+        yield (acc["original"].strip(),
+               acc["English"].strip(),
+               acc["Chinese"].strip(),
+               acc["Thai"].strip(),
+               acc["Russian"].strip())
+
+# ────────────────────────── Gradio UI ──────────────────────────────
 with gr.Blocks(title="SMARTok Demo", theme=gr.themes.Soft()) as app:
     with gr.Tabs():
+        # 1) 오디오(녹음·업로드) 번역
+        with gr.TabItem("🎙️ 오디오 번역"):
+            src1 = gr.Dropdown(LANGUAGES, value="Korean", label="입력")
+            tgt1 = gr.Dropdown(LANGUAGES, value="English", label="출력")
+            aud1 = gr.Audio(sources=["microphone","upload"],
+                            type="filepath",
+                            label="녹음 또는 오디오 파일 업로드")
             btn1 = gr.Button("번역")
             stt1 = gr.Textbox(label="원문", lines=5)
             tlt1 = gr.Textbox(label="번역", lines=5)
+            out1 = gr.Audio(label="TTS", type="filepath", autoplay=True)
+            btn1.click(translate_audio, [aud1,src1,tgt1], [stt1,tlt1,out1])
 
         # 2) PDF / 이미지 번역
         with gr.TabItem("📄 문서/이미지 번역"):
+            src2 = gr.Dropdown(LANGUAGES, value="Korean", label="입력")
+            tgt2 = gr.Dropdown(LANGUAGES, value="English", label="출력")
+            file2= gr.File(label="PDF 또는 이미지 업로드",
+                           file_types=[".pdf",".png",".jpg",".jpeg",
+                                       ".bmp",".tiff",".gif"])
             btn2 = gr.Button("번역")
+            org2 = gr.Textbox(label="추출 원문", lines=15)
+            trs2 = gr.Textbox(label="번역 결과", lines=15)
+            btn2.click(translate_document, [file2,src2,tgt2], [org2,trs2])
+
+        # 3) 실시간 1개 언어 번역(선택)
+        with gr.TabItem("⏱️ 실시간 1언어"):
+            src3 = gr.Dropdown(LANGUAGES, value="Korean", label="입력")
+            tgt3 = gr.Dropdown(LANGUAGES, value="English", label="출력")
+            mic3 = gr.Audio(sources=["microphone"],
+                            streaming=True, label="실시간 마이크")
+            stt3 = gr.Textbox(label="원문(실시간)", lines=8)
+            tlt3 = gr.Textbox(label="번역(실시간)", lines=8)
+            mic3.stream(lambda a,s,t: stream_single(a,s,t),
+                        inputs=[src3,tgt3],
+                        outputs=[stt3,tlt3])
+
+        # 4) **실시간 4개 언어 동시 번역** ← 핵심 데모
+        with gr.TabItem("🌏 실시간 4개 언어"):
+            gr.Markdown("마이크 입력을 **English / Chinese(简体) / Thai / Russian** "
+                        "4개 언어로 실시간(3-4 초 지연) 동시 번역합니다.")
+            src4 = gr.Dropdown(LANGUAGES, value="Korean", label="입력 언어")
+            mic4 = gr.Audio(sources=["microphone"],
+                            streaming=True, label="실시간 마이크")
+            o4 = gr.Textbox(label="원문", lines=8)
+            e4 = gr.Textbox(label="English", lines=8)
+            z4 = gr.Textbox(label="Chinese(简体)", lines=8)
+            t4 = gr.Textbox(label="Thai", lines=8)
+            r4 = gr.Textbox(label="Russian", lines=8)
+
+            # Audio.stream → 5개 출력
+            mic4.stream(lambda a,s: stream_multi(a,s),
+                        inputs=[src4],
+                        outputs=[o4,e4,z4,t4,r4])
+
+# ─────────────────────────── 실행 ────────────────────────────────
 if __name__ == "__main__":
+    app.launch(server_name="0.0.0.0",
+               server_port=7860,
+               share=False,
+               debug=True)
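One caveat worth flagging on the two streaming tabs: stream_single() and stream_multi() read chunks via mic_stream.recv(), and the .stream() wiring does not pass the microphone component itself in inputs. In recent Gradio releases the streaming callback is instead called repeatedly with (sample_rate, numpy_array) chunks, with the audio component listed among the inputs, so the wiring may need to look more like the sketch below. This is an assumption about the intended behaviour, not code from this commit; stream_single_chunks, state3 and the buffer handling are hypothetical.

import wave, tempfile
import numpy as np
import gradio as gr

# Hypothetical chunk-based handler: accumulates roughly STREAM_SEC seconds of PCM
# in a gr.State buffer, writes it to a temporary WAV, then reuses translate_audio()
# from app.py above. Assumes 16-bit mono chunks arriving as (sample_rate, np.ndarray).
def stream_single_chunks(state, new_chunk, src, tgt):
    sr, data = new_chunk
    acc = data if state is None else np.concatenate([state, data])
    if len(acc) < sr * STREAM_SEC:                  # keep buffering
        return acc, gr.update(), gr.update()
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    with wave.open(tmp.name, "wb") as w:
        w.setnchannels(1); w.setsampwidth(2); w.setframerate(sr)
        w.writeframes(acc.astype(np.int16).tobytes())
    o, t, _ = translate_audio(tmp.name, src, tgt)
    return None, o, t                               # reset the buffer, emit text

# Wiring inside the Blocks context, with mic3 and a gr.State buffer in inputs:
# state3 = gr.State()
# mic3.stream(stream_single_chunks,
#             inputs=[state3, mic3, src3, tgt3],
#             outputs=[state3, stt3, tlt3])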