# voice-trans / app.py
# (Residue from the web file-viewer page this file was copied from:
#  "openfree's picture", "Update app.py", "32b3c75 verified", "raw",
#  "history blame", "8.31 kB".)
import gradio as gr
import openai
import os
import tempfile
from dotenv import load_dotenv
# ===== Common initialization ===============================
# Load variables from a .env file into the environment, then read the key.
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    print("⚠️ OPENAI_API_KEYλ₯Ό .env νŒŒμΌμ— μ„€μ •ν•˜μ„Έμš”!")
else:
    # Confirm the key was found without echoing any part of the secret:
    # stdout usually ends up in logs that other people can read.
    print("βœ… API Key λ‘œλ“œλ¨")
try:
    client = openai.OpenAI(api_key=api_key)
except Exception as e:
    # Any failure while building the client is reported and leaves
    # `client` as None; the request handlers check for that.
    print(f"❌ OpenAI ν΄λΌμ΄μ–ΈνŠΈ μ΄ˆκΈ°ν™” μ‹€νŒ¨: {e}")
    client = None
# ===== Language settings ===================================
# One row per supported language:
#   (display name, ISO-639 code given to Whisper, OpenAI TTS voice).
# The three public tables below are all derived from these rows, so they
# always contain the same languages in the same order.
_LANGUAGE_TABLE = (
    ("Korean", "ko", "nova"),
    ("English", "en", "alloy"),
    ("Japanese", "ja", "nova"),
    ("Chinese", "zh", "nova"),
    ("Thai", "th", "alloy"),
    ("Russian", "ru", "alloy"),
    ("Vietnamese", "vi", "alloy"),
    ("Spanish", "es", "alloy"),
    ("French", "fr", "alloy"),
)

# Display names offered in the language dropdowns.
LANGUAGES = [name for name, _code, _voice in _LANGUAGE_TABLE]

# Language name -> ISO-639 code used for Whisper transcription.
LANG_CODE_MAP = {name: code for name, code, _voice in _LANGUAGE_TABLE}

# Language name -> TTS voice (only "alloy" and "nova" are used here).
VOICE_MAP = {name: voice for name, _code, voice in _LANGUAGE_TABLE}
# ----------------------------------------------------------
# (1) Speech (STT) -> translation -> speech (TTS)
# ----------------------------------------------------------
def translate_audio(audio_file, source_lang, target_lang):
    """Transcribe speech, translate the transcript, and synthesize the result.

    Args:
        audio_file: Path of the recorded or uploaded audio file. A falsy
            value means nothing was provided.
        source_lang: Name of the spoken language, looked up in LANG_CODE_MAP.
        target_lang: Name of the language to translate into, looked up in
            VOICE_MAP to pick the TTS voice.

    Returns:
        A 3-tuple ``(original_text, translated_text, output_audio_path)``.
        When a step cannot run or raises, the first element carries a
        user-facing message, the second is ``""`` and the third is ``None``.
        If the model returns an empty translation, the transcript is
        returned together with ``""`` and ``None`` (no audio is produced).
    """
    if not audio_file:
        return "⚠️ μ˜€λ””μ˜€ νŒŒμΌμ„ μ—…λ‘œλ“œν•˜κ±°λ‚˜ λ…ΉμŒν•˜μ„Έμš”.", "", None
    if not api_key or not client:
        return "❌ API μ΄ˆκΈ°ν™” 였λ₯˜", "", None
    if source_lang == target_lang:
        return "⚠️ μž…λ ₯ 언어와 좜λ ₯ μ–Έμ–΄κ°€ κ°™μŠ΅λ‹ˆλ‹€.", "", None
    try:
        # ---------- Whisper STT ----------
        stt_kwargs = {"model": "whisper-1"}
        lang_code = LANG_CODE_MAP.get(source_lang)
        if lang_code:
            # Send `language` only when a code is known. Leaving the
            # parameter out (rather than passing None) requests
            # auto-detection.
            stt_kwargs["language"] = lang_code
        with open(audio_file, "rb") as f:
            transcript = client.audio.transcriptions.create(file=f, **stt_kwargs)
        original_text = (transcript.text or "").strip()
        if not original_text:
            return "⚠️ μŒμ„±μ΄ μΈμ‹λ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€.", "", None

        # ---------- GPT translation ----------
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system",
                 "content": f"You are a professional translator. Translate the following {source_lang} text to {target_lang}. "
                            f"Only provide the translation without any explanation or additional text."},
                {"role": "user", "content": original_text}
            ],
            temperature=0.3,
            max_tokens=2000
        )
        # `content` may be None; treat that the same as an empty reply
        # instead of failing on `.strip()`.
        translated_text = (response.choices[0].message.content or "").strip()
        if not translated_text:
            # Nothing to speak: skip the TTS request rather than sending
            # empty input, and still hand back the transcript.
            return original_text, "", None

        # ---------- TTS ----------
        tts_response = client.audio.speech.create(
            model="tts-1",
            voice=VOICE_MAP.get(target_lang, "alloy"),
            # Input is capped at 4096 characters (presumably the tts-1
            # input limit; confirm against the API reference).
            input=translated_text[:4096]
        )
        # delete=False keeps the file on disk after the `with` block so its
        # path can be returned to the caller.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
            tmp.write(tts_response.content)
        return original_text, translated_text, tmp.name
    except Exception as e:
        # UI boundary: report the failure as text instead of raising.
        return f"❌ 였λ₯˜: {type(e).__name__}: {str(e)}", "", None
# ----------------------------------------------------------
# (2) PDF / image -> translation
# ----------------------------------------------------------
def translate_document(file_obj, source_lang, target_lang):
    """Extract text from a PDF or image file and translate it.

    Args:
        file_obj: The uploaded file. Either a path (``str`` or
            ``os.PathLike``) or an object whose ``.name`` attribute is the
            path. A falsy value means nothing was uploaded.
        source_lang: Name of the document's language (used in the prompt).
        target_lang: Name of the language to translate into.

    Returns:
        A 2-tuple ``(original_text, translated_text)``. When a step cannot
        run or raises, the first element carries a user-facing message and
        the second is ``""``.
    """
    if not file_obj:
        return "⚠️ νŒŒμΌμ„ μ—…λ‘œλ“œν•˜μ„Έμš”.", ""
    if not api_key or not client:
        return "❌ API μ΄ˆκΈ°ν™” 였λ₯˜", ""
    if source_lang == target_lang:
        return "⚠️ μž…λ ₯ 언어와 좜λ ₯ μ–Έμ–΄κ°€ κ°™μŠ΅λ‹ˆλ‹€.", ""
    try:
        # Accept both a plain path and a file-like wrapper exposing `.name`.
        # Resolved inside the `try` so an unexpected object is reported as an
        # error message instead of crashing the handler.
        if isinstance(file_obj, (str, os.PathLike)):
            file_path = os.fspath(file_obj)
        else:
            file_path = file_obj.name
        ext = os.path.splitext(file_path)[1].lower()

        # --- Text extraction ---
        if ext == ".pdf":
            import pdfplumber
            with pdfplumber.open(file_path) as pdf:
                # Demo limit: only the first 5 pages are read.
                page_texts = [page.extract_text() or "" for page in pdf.pages[:5]]
            original_text = "\n".join(page_texts).strip()
        elif ext in (".png", ".jpg", ".jpeg", ".webp", ".bmp", ".tiff"):
            from PIL import Image
            import pytesseract
            # NOTE(review): no OCR language is passed, so source_lang does
            # not influence recognition here - confirm this is intended.
            with Image.open(file_path) as image:  # close the file handle when done
                # Strip so whitespace-only OCR output counts as "no text",
                # matching the PDF branch.
                original_text = pytesseract.image_to_string(image).strip()
        else:
            return "⚠️ μ§€μ›ν•˜μ§€ μ•ŠλŠ” ν˜•μ‹μž…λ‹ˆλ‹€.", ""
        if not original_text:
            return "⚠️ ν…μŠ€νŠΈλ₯Ό μΆ”μΆœν•  수 μ—†μŠ΅λ‹ˆλ‹€.", ""

        # --- Translation ---
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system",
                 "content": f"You are a professional translator. Translate the following {source_lang} text to {target_lang}. "
                            f"Only provide the translation without any explanation or additional text."},
                {"role": "user", "content": original_text}
            ],
            temperature=0.3,
            max_tokens=4096
        )
        # `content` may be None; treat that as an empty translation rather
        # than failing on `.strip()`.
        translated_text = (response.choices[0].message.content or "").strip()
        return original_text, translated_text
    except Exception as e:
        # UI boundary: report the failure as text instead of raising.
        return f"❌ 였λ₯˜: {type(e).__name__}: {str(e)}", ""
# ==========================================================
# Gradio UI
# ==========================================================
# Two-tab layout: speech translation and document translation. Components are
# created in display order, so statement order here is the on-screen order.
with gr.Blocks(title="SMARTok Demo", theme=gr.themes.Soft()) as app:
    with gr.Tabs():
        # ----- Speech translation tab -----
        with gr.TabItem(label="πŸŽ™οΈ μŒμ„± λ²ˆμ—­"):
            # The Markdown text stays flush-left so the string value is
            # exactly what the original passed.
            gr.Markdown(value="""
# πŸŽ™οΈ AI μŒμ„± λ²ˆμ—­κΈ°
마이크둜 λ…ΉμŒν•˜κ±°λ‚˜ μ˜€λ””μ˜€ νŒŒμΌμ„ μ—…λ‘œλ“œν•˜λ©΄ **μ‹€μ‹œκ°„ μžλ§‰ + λ²ˆμ—­ + μŒμ„±ν•©μ„±**κΉŒμ§€ ν•œ λ²ˆμ—!
""")
            with gr.Row():
                src_lang_a = gr.Dropdown(
                    choices=LANGUAGES, value="Korean", label="μž…λ ₯ μ–Έμ–΄"
                )
                tgt_lang_a = gr.Dropdown(
                    choices=LANGUAGES, value="English", label="좜λ ₯ μ–Έμ–΄"
                )
            # Microphone recording or file upload; the handler receives a path.
            audio_in = gr.Audio(
                sources=["microphone", "upload"],
                type="filepath",
                label="μŒμ„± μž…λ ₯ (λ…ΉμŒ λ˜λŠ” 파일 μ—…λ‘œλ“œ)",
            )
            btn_audio = gr.Button(value="πŸ”„ λ²ˆμ—­ν•˜κΈ°")
            with gr.Row():
                stt_text = gr.Textbox(label="πŸ“ 원본 ν…μŠ€νŠΈ", lines=5)
                tlt_text = gr.Textbox(label="🌐 λ²ˆμ—­λœ ν…μŠ€νŠΈ", lines=5)
            audio_out = gr.Audio(
                label="πŸ”Š λ²ˆμ—­λœ μŒμ„±", type="filepath", autoplay=True
            )
            # translate_audio returns (transcript, translation, audio path),
            # matching the three outputs below.
            btn_audio.click(
                fn=translate_audio,
                inputs=[audio_in, src_lang_a, tgt_lang_a],
                outputs=[stt_text, tlt_text, audio_out],
            )
        # ----- Document translation tab -----
        with gr.TabItem(label="πŸ“„ 자료 λ²ˆμ—­"):
            gr.Markdown(value="""
# πŸ“„ PDF / 이미지 λ²ˆμ—­ 데λͺ¨
κ΅μœ‘μžλ£ŒΒ·λ°œν‘œμžλ£Œ λ“± **PDF μ΅œλŒ€ 5μͺ½** λ˜λŠ” 이미지 1μž₯을 μ—…λ‘œλ“œν•˜λ©΄ ν…μŠ€νŠΈ μΆ”μΆœ ν›„ λ²ˆμ—­ν•΄μ€λ‹ˆλ‹€.
""")
            with gr.Row():
                src_lang_d = gr.Dropdown(
                    choices=LANGUAGES, value="Korean", label="μž…λ ₯ μ–Έμ–΄"
                )
                tgt_lang_d = gr.Dropdown(
                    choices=LANGUAGES, value="English", label="좜λ ₯ μ–Έμ–΄"
                )
            file_in = gr.File(label="PDF / 이미지 μ—…λ‘œλ“œ")
            btn_doc = gr.Button(value="πŸ”„ λ²ˆμ—­ν•˜κΈ°")
            original_doc = gr.Textbox(label="πŸ“ μΆ”μΆœλœ 원문", lines=15)
            translated_doc = gr.Textbox(label="🌐 λ²ˆμ—­ κ²°κ³Ό", lines=15)
            # translate_document returns (extracted text, translation).
            btn_doc.click(
                fn=translate_document,
                inputs=[file_in, src_lang_d, tgt_lang_d],
                outputs=[original_doc, translated_doc],
            )
# ==========================================================
if __name__ == "__main__":
    # Script entry point: announce startup, then serve the Gradio app on all
    # interfaces at port 7860 with no public share link and debug mode on.
    print("πŸš€ μ„œλ²„ μ‹œμž‘ 쀑...")
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "debug": True,
    }
    app.launch(**launch_options)