Spaces:

VIDraft
/

voice-trans

Running on Zero

App Files Files Community

openfree committed on Jun 9

Commit

0fdb888

verified ·

1 Parent(s): f73d82f

Update app.py

Browse files

Files changed (1) hide show

app.py +232 -130

app.py CHANGED Viewed

@@ -1,155 +1,257 @@
 """
-SMARTok 데모 – 이미지 OCR·실시간 탭 오류 수정본
-───────────────────────────────────────────
-• 이미지 → ocrmypdf (+ghostscript) 우선, 실패 시 pytesseract 직접 OCR
-• 실시간 1·4언어 탭 : State 인자/출력 개수 맞춰 경고 제거
 """
 import gradio as gr
 import openai, os, io, tempfile, mimetypes
 from dotenv import load_dotenv
 from PIL import Image
-import pdfplumber, pytesseract, ocrmypdf, subprocess, shlex
-# ───── 0. Init ────────────────────────────────────────────────────
 load_dotenv()
-client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY", ""))
-LANG = ["Korean","English","Japanese","Chinese",
-        "Thai","Russian","Vietnamese","Spanish","French"]
-LC   = {"Korean":"ko","English":"en","Japanese":"ja","Chinese":"zh",
-        "Thai":"th","Russian":"ru","Vietnamese":"vi","Spanish":"es","French":"fr"}
-VOICE= {l:("nova" if l in ["Korean","Japanese","Chinese"] else "alloy") for l in LANG}
-FOUR = ["English","Chinese","Thai","Russian"]
-CHUNK = 4  # sec
-# ───── 1. Helpers ────────────────────────────────────────────────
-def _safe(v): return None if v is None else (v["name"] if isinstance(v,dict) else v)
-def _gpt(txt, src, tgt):
-    rsp = client.chat.completions.create(
         model="gpt-3.5-turbo",
-        messages=[{"role":"system",
-                   "content":f"Translate {src} → {tgt}. Return only the translation."},
-                  {"role":"user","content":txt}],
-        temperature=0.3,max_tokens=4096)
-    return rsp.choices[0].message.content.strip()
-def _tts(txt, lang):
-    out = client.audio.speech.create(model="tts-1",voice=VOICE.get(lang,"alloy"),
-                                     input=txt[:4096])
-    f = tempfile.NamedTemporaryFile(delete=False,suffix=".mp3")
-    f.write(out.content); f.close(); return f.name
-# ───── 2. Single Audio translate ─────────────────────────────────
-def trans_audio(inp, src, tgt):
-    p=_safe(inp)
-    if not p or not os.path.exists(p): return "⚠️ 파일 필요","",None
-    with open(p,"rb") as f:
-        stt=client.audio.transcriptions.create(model="whisper-1",file=f,
-                                               language=LC.get(src))
-    orig=stt.text.strip();
-    if not orig: return "⚠️ 인식 실패","",None
-    trans=_gpt(orig,src,tgt)
-    return orig,trans,_tts(trans,tgt)
-# ───── 3. Doc/Image translate ───────────────────────────────────
-def trans_doc(file_in, src, tgt):
-    p=_safe(file_in)
-    if not p or not os.path.exists(p): return "⚠️ 파일 업로드",""
-    ext=os.path.splitext(p)[1].lower()
-    mime=mimetypes.guess_type(p)[0] or ""
     try:
-        if ext==".pdf" or "pdf" in mime:               # PDF
-            with pdfplumber.open(p) as pdf:
-                txt="\n".join(pg.extract_text() or "" for pg in pdf.pages[:5])
-        else:                                          # 이미지
-            tmp_pdf=tempfile.NamedTemporaryFile(delete=False,suffix=".pdf").name
-            Image.open(p).save(tmp_pdf,"PDF")
-            ocr_pdf=tempfile.NamedTemporaryFile(delete=False,suffix=".pdf").name
             try:
-                ocrmypdf.ocr(tmp_pdf,ocr_pdf,
-                             lang=LC.get(src,"eng"),deskew=True,optimize=0,
-                             progress_bar=False)
                 with pdfplumber.open(ocr_pdf) as pdf:
-                    txt="\n".join(pg.extract_text() or "" for pg in pdf.pages)
-            except Exception:  # gs 없거나 ocrmypdf 실패 → 직접 OCR
-                txt=pytesseract.image_to_string(Image.open(p), lang=LC.get(src,"eng"))
     except Exception as e:
-        return f"❌ 추출 오류: {e}",""
-    txt=txt.strip()
-    if not txt: return "⚠️ 텍스트 추출 실패",""
-    return txt,_gpt(txt,src,tgt)
-# ───── 4. Real-time single lang ─────────────────────────────────
-def stream_one(path, src, tgt, state):
-    state=state or {"o":"","t":""}
-    if not path or not os.path.exists(path): return state["o"],state["t"],state
-    with open(path,"rb") as f:
-        stt=client.audio.transcriptions.create(model="whisper-1",file=f,
-                                               language=LC.get(src))
-    full=stt.text.strip(); new=full[len(state["o"]):]
     if new:
-        state["o"]=full
-        state["t"]+=" "+_gpt(new,src,tgt)
-    return state["o"],state["t"].strip(),state
-# ───── 5. Real-time 4 langs ────────────────────────────────────
-def stream_four(path, src, state):
-    state=state or {k:"" for k in ["o"]+FOUR}
-    if not path or not os.path.exists(path):
-        return state["o"],state["English"],state["Chinese"],state["Thai"],state["Russian"],state
-    with open(path,"rb") as f:
-        stt=client.audio.transcriptions.create(model="whisper-1",file=f,
-                                               language=LC.get(src))
-    full=stt.text.strip(); new=full[len(state["o"]):]
     if new:
-        state["o"]=full
-        for l in FOUR:
-            state[l]+=" "+_gpt(new,src,l)
-    return (state["o"].strip(),state["English"].strip(),state["Chinese"].strip(),
-            state["Thai"].strip(),state["Russian"].strip(),state)
-# ───── 6. UI ───────────────────────────────────────────────────
-with gr.Blocks(title="SMARTok Demo",theme=gr.themes.Soft()) as app:
     with gr.Tabs():
-        # 탭1
         with gr.TabItem("🎙️ 오디오 번역"):
-            s1=gr.Dropdown(LANG,value="Korean",label="입력")
-            t1=gr.Dropdown(LANG,value="English",label="출력")
-            a1=gr.Audio(sources=["microphone","upload"],type="filepath")
-            btn1=gr.Button("번역")
-            o1=gr.Textbox(label="원문",lines=5); tr1=gr.Textbox(label="번역",lines=5)
-            aud1=gr.Audio(label="TTS",type="filepath",autoplay=True)
-            btn1.click(trans_audio,[a1,s1,t1],[o1,tr1,aud1])
-        # 탭2
         with gr.TabItem("📄 문서·이미지 번역"):
-            s2=gr.Dropdown(LANG,value="Korean",label="입력")
-            t2=gr.Dropdown(LANG,value="English",label="출력")
-            f2=gr.File(file_types=[".pdf",".png",".jpg",".jpeg",".bmp",".tiff",".gif"])
-            btn2=gr.Button("번역")
-            o2=gr.Textbox(label="추출 원문",lines=15); tr2=gr.Textbox(label="번역 결과",lines=15)
-            btn2.click(trans_doc,[f2,s2,t2],[o2,tr2])
-        # 탭3
         with gr.TabItem("⏱️ 실시간 1언어"):
-            s3=gr.Dropdown(LANG,value="Korean",label="입력"); t3=gr.Dropdown(LANG,value="English",label="출력")
-            mic3=gr.Audio(sources=["microphone"],streaming=True)
-            o3=tr3=gr.Textbox(lines=8,label="원문 / 번역")
-            st3=gr.State()
-            mic3.stream(stream_one,inputs=[s3,t3,st3],outputs=[o3,tr3,st3])
-        # 탭4
         with gr.TabItem("🌏 실시간 4언어"):
-            s4=gr.Dropdown(LANG,value="Korean",label="입력 언어")
-            mic4=gr.Audio(sources=["microphone"],streaming=True)
-            o4=gr.Textbox(label="원문",lines=8); e4=gr.Textbox(label="English",lines=8)
-            c4=gr.Textbox(label="Chinese(简体)",lines=8); th4=gr.Textbox(label="Thai",lines=8); r4=gr.Textbox(label="Russian",lines=8)
-            st4=gr.State()
-            mic4.stream(stream_four,inputs=[s4,st4],
-                        outputs=[o4,e4,c4,th4,r4,st4])
-# ───── 7. Run ──────────────────────────────────────────────────
-if __name__=="__main__":
-    app.launch(server_name="0.0.0.0",server_port=7860,share=False,debug=True)

 """
+SMARTok 데모 – 최종 안정판 (2025-06-09)
+────────────────────────────────────────────────────────
+● 탭1  🎙️ 오디오 번역          : 마이크/파일 → 번역 + TTS
+● 탭2  📄 문서‧이미지 번역      : PDF / 이미지(OCR) → 번역
+● 탭3  ⏱️ 실시간 1언어 번역     : 마이크 → 실시간 자막(1개 언어)
+● 탭4  🌏 실시간 4언어 번역     : 마이크 → 영·중·태·러 동시 자막
+────────────────────────────────────────────────────────
+필수 APT 패키지 (packages.txt)
+  tesseract-ocr
+  libtesseract-dev
+  ghostscript
+  tesseract-ocr-kor  tesseract-ocr-eng
+  tesseract-ocr-rus  tesseract-ocr-tha
+  tesseract-ocr-chi-sim
+  ffmpeg
+필수 PIP 패키지 (requirements.txt)
+  gradio>=5.33
+  openai
+  python-dotenv
+  pdfplumber
+  ocrmypdf
+  pytesseract
+  pillow
 """
 import gradio as gr
 import openai, os, io, tempfile, mimetypes
 from dotenv import load_dotenv
 from PIL import Image
+import pdfplumber, ocrmypdf, pytesseract
+# ─────────────────── 0. 초기화 ────────────────────────────────────
 load_dotenv()
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+if not OPENAI_API_KEY:
+    raise RuntimeError("OPENAI_API_KEY 가 .env 에 없습니다!")  # fail fast at import time when the key is unset or empty
+client = openai.OpenAI(api_key=OPENAI_API_KEY)
+LANGUAGES = [  # language names offered by the dropdowns in the UI
+    "Korean", "English", "Japanese", "Chinese",
+    "Thai", "Russian", "Vietnamese",
+    "Spanish", "French"
+]
+LANG_CODE = {  # language name -> code passed as `lang` to ocrmypdf/pytesseract and as `language` to the Whisper calls
+    "Korean": "kor", "English": "eng", "Japanese": "jpn", "Chinese": "chi_sim",
+    "Thai": "tha",  "Russian": "rus", "Vietnamese": "vie",
+    "Spanish": "spa", "French": "fra"
+}
+VOICE = {l: ("nova" if l in ["Korean", "Japanese", "Chinese"] else "alloy")
+         for l in LANGUAGES}  # TTS voice per language: "nova" for Korean/Japanese/Chinese, "alloy" otherwise
+FOUR = ["English", "Chinese", "Thai", "Russian"]   # targets of the simultaneous four-language translation
+STREAM_SEC = 4                                     # real-time chunk length; NOTE(review): not referenced in the visible code
+# ─────────────────── 1. 공통 함수 ────────────────────────────────
+def _safe(val):
+    """Return a file path from a Gradio File/Audio value.
+
+    ``None`` is passed through, a dict yields its ``"name"`` entry, and
+    any other value is returned unchanged.
+    """
+    if val is None:
+        return None
+    return val["name"] if isinstance(val, dict) else val
+def _gpt_translate(text: str, src: str, tgt: str) -> str:
+    """GPT-3.5 번역"""
+    resp = client.chat.completions.create(
         model="gpt-3.5-turbo",
+        messages=[
+            {"role": "system",
+             "content": f"Translate the following {src} text to {tgt}. "
+                        "Return only the translated text."},
+            {"role": "user", "content": text}
+        ],
+        temperature=0.3,
+        max_tokens=4096
+    )
+    return resp.choices[0].message.content.strip()
+def _tts(text: str, lang: str) -> str:
+    """텍스트를 mp3(TTS-1)로 변환 후 경로 반환"""
+    out = client.audio.speech.create(
+        model="tts-1",
+        voice=VOICE.get(lang, "alloy"),
+        input=text[:4096]
+    )
+    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
+    tmp.write(out.content)
+    tmp.close()
+    return tmp.name
+# ─────────────────── 2. 오디오(단건) 번역 ─────────────────────────
+def translate_audio(audio_in, src, tgt):
+    path = _safe(audio_in)
+    if not path or not os.path.exists(path):
+        return "⚠️ 음성 파일을 녹음하거나 업로드하세요.", "", None
+    with open(path, "rb") as f:
+        stt = client.audio.transcriptions.create(
+            model="whisper-1", file=f, language=LANG_CODE.get(src, "eng")
+        )
+    origin = stt.text.strip()
+    if not origin:
+        return "⚠️ 음성 인식 실패", "", None
+    translated = _gpt_translate(origin, src, tgt)
+    tts_path = _tts(translated, tgt)
+    return origin, translated, tts_path
+# ─────────────────── 3. 문서/이미지 번역 ─────────────────────────
+def translate_doc(file_in, src, tgt):
+    path = _safe(file_in)
+    if not path or not os.path.exists(path):
+        return "⚠️ PDF 또는 이미지를 업로드하세요.", ""
+    ext = os.path.splitext(path)[1].lower()
+    mime = mimetypes.guess_type(path)[0] or ""
+    text = ""
     try:
+        # (A) PDF 직접 텍스트 추출
+        if ext == ".pdf" or "pdf" in mime:
+            with pdfplumber.open(path) as pdf:
+                text = "\n".join(page.extract_text() or "" for page in pdf.pages[:5])
+        # (B) 이미지 → OCR PDF → 텍스트
+        else:
+            tmp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf").name
+            Image.open(path).save(tmp_pdf, "PDF")
+            ocr_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf").name
             try:
+                # OCR 레이어 삽입 (언어 데이터 없는 경우 실패할 수 있음)
+                ocrmypdf.ocr(
+                    tmp_pdf, ocr_pdf,
+                    lang=f"{LANG_CODE.get(src,'eng')}+eng",
+                    deskew=True, optimize=0, progress_bar=False
+                )
                 with pdfplumber.open(ocr_pdf) as pdf:
+                    text = "\n".join(p.extract_text() or "" for p in pdf.pages)
+            except Exception:
+                # ocrmypdf 실패 → pytesseract 직접
+                text = pytesseract.image_to_string(
+                    Image.open(path), lang=LANG_CODE.get(src, "eng")
+                )
     except Exception as e:
+        return f"❌ 텍스트 추출 오류: {e}", ""
+    text = text.strip()
+    if not text:
+        return "⚠️ 텍스트를 추출하지 못했습니다.", ""
+    return text, _gpt_translate(text, src, tgt)
+# ─────────────────── 4. 실시간 1언어 스트림 ──────────────────────
+def stream_one(audio_path, src, tgt, state):
+    """state = {'orig': str, 'trans': str}"""
+    state = state or {"orig": "", "trans": ""}
+    if not audio_path or not os.path.exists(audio_path):
+        return state["orig"], state["trans"], state
+    with open(audio_path, "rb") as f:
+        stt = client.audio.transcriptions.create(
+            model="whisper-1", file=f, language=LANG_CODE.get(src, "eng")
+        )
+    full = stt.text.strip()
+    new = full[len(state["orig"]):]
     if new:
+        state["orig"] = full
+        state["trans"] += " " + _gpt_translate(new, src, tgt)
+    return state["orig"], state["trans"].strip(), state
+# ─────────────────── 5. 실시간 4언어 스트림 ──────────────────────
+def stream_four(audio_path, src, state):
+    """
+    state keys: orig / English / Chinese / Thai / Russian
+    """
+    state = state or {k: "" for k in ["orig"] + FOUR}
+    if not audio_path or not os.path.exists(audio_path):
+        return (state["orig"], state["English"], state["Chinese"],
+                state["Thai"], state["Russian"], state)
+    with open(audio_path, "rb") as f:
+        stt = client.audio.transcriptions.create(
+            model="whisper-1", file=f, language=LANG_CODE.get(src, "eng")
+        )
+    full = stt.text.strip()
+    new = full[len(state["orig"]):]
     if new:
+        state["orig"] = full
+        for tgt in FOUR:
+            state[tgt] += " " + _gpt_translate(new, src, tgt)
+    return (state["orig"].strip(), state["English"].strip(), state["Chinese"].strip(),
+            state["Thai"].strip(), state["Russian"].strip(), state)
+# ─────────────────── 6. Gradio UI ──────────────────────────────
+with gr.Blocks(title="SMARTok Demo", theme=gr.themes.Soft()) as app:
     with gr.Tabs():
+        # 탭 1 ─ 오디오 번역
         with gr.TabItem("🎙️ 오디오 번역"):
+            s1 = gr.Dropdown(LANGUAGES, value="Korean", label="입력 언어")
+            t1 = gr.Dropdown(LANGUAGES, value="English", label="출력 언어")
+            aud1 = gr.Audio(sources=["microphone", "upload"], type="filepath")
+            btn1 = gr.Button("번역")
+            o1 = gr.Textbox(label="원문", lines=5)
+            tr1 = gr.Textbox(label="번역", lines=5)
+            a1 = gr.Audio(label="TTS", type="filepath", autoplay=True)
+            btn1.click(translate_audio, [aud1, s1, t1], [o1, tr1, a1])
+        # 탭 2 ─ 문서·이미지 번역
         with gr.TabItem("📄 문서·이미지 번역"):
+            s2 = gr.Dropdown(LANGUAGES, value="Korean", label="입력 언어")
+            t2 = gr.Dropdown(LANGUAGES, value="English", label="출력 언어")
+            file2 = gr.File(label="PDF / 이미지 업로드",
+                            file_types=[".pdf", ".png", ".jpg", ".jpeg",
+                                        ".bmp", ".tiff", ".gif"])
+            btn2 = gr.Button("번역")
+            o2 = gr.Textbox(label="추출 원문", lines=15)
+            tr2 = gr.Textbox(label="번역 결과", lines=15)
+            btn2.click(translate_doc, [file2, s2, t2], [o2, tr2])
+        # 탭 3 ─ 실시간 1언어 번역
         with gr.TabItem("⏱️ 실시간 1언어"):
+            s3 = gr.Dropdown(LANGUAGES, value="Korean", label="입력 언어")
+            t3 = gr.Dropdown(LANGUAGES, value="English", label="출력 언어")
+            mic3 = gr.Audio(sources=["microphone"], streaming=True)
+            o3 = gr.Textbox(label="원문(실시간)", lines=8)
+            tr3 = gr.Textbox(label="번역(실시간)", lines=8)
+            st3 = gr.State()
+            mic3.stream(stream_one, inputs=[s3, t3, st3],
+                        outputs=[o3, tr3, st3])
+        # 탭 4 ─ 실시간 4언어 번역
         with gr.TabItem("🌏 실시간 4언어"):
+            s4 = gr.Dropdown(LANGUAGES, value="Korean", label="입력 언어")
+            mic4 = gr.Audio(sources=["microphone"], streaming=True)
+            o4 = gr.Textbox(label="원문", lines=8)
+            e4 = gr.Textbox(label="English", lines=8)
+            c4 = gr.Textbox(label="Chinese(简体)", lines=8)
+            th4 = gr.Textbox(label="Thai", lines=8)
+            r4 = gr.Textbox(label="Russian", lines=8)
+            st4 = gr.State()
+            mic4.stream(stream_four, inputs=[s4, st4],
+                        outputs=[o4, e4, c4, th4, r4, st4])
+# ─────────────────── 7. 실행 ───────────────────────────────────
+if __name__ == "__main__":  # start the Gradio server only when run as a script
+    app.launch(server_name="0.0.0.0",  # listen on all network interfaces
+               server_port=7860,
+               share=False,
+               debug=True)