# Hugging Face Space: VIDraft/voice-trans (Running on Zero)
# File size: 24,608 bytes

# SMARTok Demo - 실시간 다국어 번역 시스템
# 
# 필수 패키지:
# pip install gradio openai python-dotenv pdfplumber numpy websockets
# 
# 선택 패키지 (비디오 처리):
# - ffmpeg 설치: sudo apt-get install ffmpeg (Linux) / brew install ffmpeg (Mac)
# - 또는 pip install moviepy
#
# 환경 변수:
# .env 파일에 OPENAI_API_KEY 설정 필요

import os, asyncio, json, tempfile, websockets, pdfplumber
import gradio as gr
import openai
from dotenv import load_dotenv
import numpy as np
import wave
import subprocess
import mimetypes

# ─── 0. Initialization ───────────────────────────────────────
# Load variables from a .env file (if any), then read the OpenAI API key
# from the environment.
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
# Fail fast at import time when no key is configured.
if not openai.api_key:
    raise RuntimeError("OPENAI_API_KEY 가 .env 에 없습니다!")

# ffmpeg 설치 확인
def check_ffmpeg():
    """Report whether the ``ffmpeg`` executable can be run.

    Runs ``ffmpeg -version`` with its output captured.

    Returns:
        bool: True if the command started and exited with status 0; False if
        the executable could not be launched or exited with a non-zero status.
    """
    try:
        subprocess.run(['ffmpeg', '-version'], capture_output=True, check=True)
    except (OSError, subprocess.CalledProcessError):
        # OSError covers a missing / non-executable binary, CalledProcessError
        # a non-zero exit.  Anything else (e.g. KeyboardInterrupt) propagates
        # instead of being silently reported as "ffmpeg not installed".
        return False
    return True

# Probe for ffmpeg once at import time and warn on stdout when it is missing.
HAS_FFMPEG = check_ffmpeg()
if not HAS_FFMPEG:
    print(
        "⚠️ ffmpeg가 설치되어 있지 않습니다. 비디오 처리가 제한될 수 있습니다.",
        "설치 방법: sudo apt-get install ffmpeg (Linux) / brew install ffmpeg (Mac)",
        sep="\n",
    )

# Languages offered in every language dropdown.
LANG = [
    "Korean",
    "English",
    "Japanese",
    "Chinese",
    "Thai",
    "Russian",
    "Vietnamese",
    "Spanish",
    "French",
]

# TTS voice per language: "nova" for Korean/Japanese/Chinese, "alloy" otherwise.
VOICE = dict.fromkeys(LANG, "alloy")
VOICE.update(dict.fromkeys(("Korean", "Japanese", "Chinese"), "nova"))

# Target languages of the four-language realtime tab.
FOUR = ["English", "Chinese", "Thai", "Russian"]

# WebSocket URL of the realtime endpoint.
WS_URL = "wss://api.openai.com/v1/realtime"

# ─── 1. 공통 GPT 번역 / TTS ─────────────────────────────────
# 전역 클라이언트 관리
client = None
_client_loop = None  # event loop that was running when `client` was created


def get_client():
    """Return a shared ``openai.AsyncClient``, creating it on first use.

    The handlers in this file drive coroutines through separate
    ``asyncio.run()`` calls, and each call runs on a brand-new event loop.
    The cached client is therefore only reused while the same loop is
    running; when called under a different loop (or outside any loop) a fresh
    client is created and cached instead.

    NOTE(review): the motivation is that an async HTTP client's pooled
    connections are presumably bound to the loop they were opened on, so
    reusing them after that loop is closed can fail with "Event loop is
    closed" -- confirm against the openai/httpx versions in use.

    Returns:
        openai.AsyncClient: client usable from the currently running loop.
    """
    global client, _client_loop
    try:
        loop = asyncio.get_running_loop()
    except RuntimeError:
        # Called from synchronous code: no loop is running.
        loop = None

    cached = client
    if cached is None or loop is not _client_loop:
        cached = openai.AsyncClient()
        client, _client_loop = cached, loop
    return cached

async def gpt_translate(text, src, tgt):
    """Translate ``text`` from ``src`` to ``tgt`` with a chat completion.

    Args:
        text: Text to translate.
        src: Source language name, interpolated into the system prompt.
        tgt: Target language name, interpolated into the system prompt.

    Returns:
        str: The stripped translation, or "" when ``text`` is empty or
        whitespace-only, when the response carries no content, or when the
        request fails (the error is printed, not raised).
    """
    # Nothing to translate: skip the API call rather than asking the model to
    # "translate" an empty message.
    if not text or not text.strip():
        return ""

    try:
        api = get_client()
        rsp = await api.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role":"system",
                       "content":f"Translate {src} → {tgt}. Return only the text."},
                      {"role":"user","content":text}],
            temperature=0.3,max_tokens=2048)
        # The message content can be None; treat that as an empty result.
        return (rsp.choices[0].message.content or "").strip()
    except Exception as e:
        # Best-effort: callers treat "" as "no translation available".
        print(f"번역 오류: {e}")
        return ""

async def gpt_tts(text, lang):
    """Synthesize speech for ``text`` and save it as a temporary MP3 file.

    Args:
        text: Text to speak; only the first 4096 characters are sent.
        lang: Language name used to look up the voice in ``VOICE``.

    Returns:
        str | None: Path of the MP3 file (created with ``delete=False``, so
        it is not removed here), or None when ``text`` is empty or
        whitespace-only or when synthesis fails (the error is printed).
    """
    # There is nothing to speak; skip the request entirely.
    if not text or not text.strip():
        return None

    try:
        api = get_client()
        rsp = await api.audio.speech.create(
            model="tts-1", voice=VOICE[lang], input=text[:4096])

        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
        try:
            with tmp:  # always closes the handle, even if the write fails
                tmp.write(rsp.content)
        except Exception:
            # Do not leave a partial file behind.
            if os.path.exists(tmp.name):
                os.unlink(tmp.name)
            raise
        return tmp.name
    except Exception as e:
        print(f"TTS 오류: {e}")
        return None

# ─── 2. PDF 번역 ────────────────────────────────────────────
def translate_pdf(file, src, tgt):
    """Extract text from the first five pages of a PDF and translate it.

    Args:
        file: Uploaded PDF, either as a filesystem path (``str`` /
            ``os.PathLike``) or as an object exposing the path via ``.name``.
        src: Source language name.
        tgt: Target language name.

    Returns:
        tuple[str, str]: ``(extracted_text, translation)``.  When no file is
        given or no text can be extracted, the first element is a warning
        message and the second is "".
    """
    if not file:
        return "⚠️ PDF 업로드 필요", ""

    # The upload may arrive as a plain path string or as a file-like wrapper;
    # only the latter has a ``.name`` attribute.
    path = file if isinstance(file, (str, os.PathLike)) else file.name

    with pdfplumber.open(path) as pdf:
        # extract_text() may yield None for a page; substitute "".
        text = "\n".join(p.extract_text() or "" for p in pdf.pages[:5]).strip()
    if not text:
        return "⚠️ 텍스트 추출 실패", ""
    return text, asyncio.run(gpt_translate(text, src, tgt))

# ─── 2-1. 오디오 번역 (탭1용) ────────────────────────────────
def extract_audio_from_video(video_path):
    """Extract the audio track of a video file into a temporary WAV file.

    ffmpeg is tried first when ``HAS_FFMPEG`` is set (16 kHz, mono, 16-bit
    PCM).  If ffmpeg is unavailable or exits with an error, moviepy is used
    as a fallback.

    Args:
        video_path: Path of the video file to read.

    Returns:
        str: Path of the WAV file.  The caller is responsible for deleting it.

    Raises:
        Exception: If neither backend can produce the audio.  The temporary
            WAV file is removed before the exception propagates.
    """
    # Reserve an output path; the handle is closed so the backends can write it.
    audio_output = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    audio_output.close()
    wav_path = audio_output.name

    try:
        # Method 1: ffmpeg
        if HAS_FFMPEG:
            cmd = [
                'ffmpeg',
                '-i', video_path,
                '-vn',  # drop the video stream
                '-acodec', 'pcm_s16le',  # 16-bit PCM (WAV)
                '-ar', '16000',  # 16 kHz sample rate
                '-ac', '1',  # mono
                '-y',  # overwrite the reserved output file
                wav_path
            ]

            result = subprocess.run(cmd, capture_output=True, text=True)

            if result.returncode == 0:
                return wav_path
            print(f"ffmpeg 오류: {result.stderr}")

        # Method 2: moviepy
        try:
            from moviepy.editor import VideoFileClip
        except ImportError as e:
            raise RuntimeError(
                "비디오 처리를 위해 ffmpeg 또는 moviepy가 필요합니다.\n"
                "설치: pip install moviepy 또는 ffmpeg 설치"
            ) from e

        print("moviepy를 사용하여 오디오 추출 중...")
        try:
            video = VideoFileClip(video_path)
            try:
                if video.audio is None:
                    # Clear message instead of an AttributeError on None.
                    raise ValueError("비디오에 오디오 트랙이 없습니다")
                video.audio.write_audiofile(
                    wav_path,
                    fps=16000,
                    nbytes=2,
                    codec='pcm_s16le',
                    verbose=False,
                    logger=None
                )
            finally:
                # Release the clip even when extraction fails.
                video.close()
        except Exception as e:
            raise RuntimeError(f"moviepy 오류: {str(e)}") from e
        return wav_path

    except Exception:
        # Clean up the reserved file on any failure, then re-raise unchanged.
        if os.path.exists(wav_path):
            os.unlink(wav_path)
        raise

async def translate_audio_async(file, src, tgt):
    """Transcribe an audio/video file, translate the text, and synthesize speech.

    Args:
        file: Path of the uploaded or recorded audio/video file.
        src: Source language name (spoken language of the input).
        tgt: Target language name.

    Returns:
        tuple: ``(original_text, translated_text, tts_audio_path)``.  On
        missing input, silence, or failure the first element is a warning
        message and the audio path is None; on failure the second element
        carries the error text.
    """
    if not file: return "⚠️ 오디오/비디오 업로드 필요", "", None

    # Language codes for the transcription request.  Slicing the first two
    # letters of the name gives "ch" for Chinese and "sp" for Spanish, which
    # are not valid codes, so map the names explicitly.
    language_codes = {
        "Korean": "ko", "English": "en", "Japanese": "ja", "Chinese": "zh",
        "Thai": "th", "Russian": "ru", "Vietnamese": "vi", "Spanish": "es",
        "French": "fr",
    }
    # Video containers the upload widget accepts; used when mimetypes cannot
    # classify the extension.
    video_exts = (".mp4", ".avi", ".mov", ".mkv", ".webm", ".flv")

    temp_audio_path = None
    try:
        # Decide whether the input is a video
        mime_type, _ = mimetypes.guess_type(file)
        audio_file_path = file
        is_video = bool(mime_type and mime_type.startswith('video/')) or \
            str(file).lower().endswith(video_exts)

        # For video input, extract the audio track first
        if is_video:
            print(f"비디오 파일 감지: {mime_type}")
            print(f"파일 크기: {os.path.getsize(file) / 1024 / 1024:.1f} MB")
            print("비디오에서 오디오 추출 중... (시간이 걸릴 수 있습니다)")
            temp_audio_path = extract_audio_from_video(file)
            audio_file_path = temp_audio_path
            print("오디오 추출 완료!")

        # STT via the transcription API
        print("음성 인식 중...")
        api = get_client()
        with open(audio_file_path, 'rb') as audio_file:
            transcript = await api.audio.transcriptions.create(
                model="whisper-1",
                file=audio_file,
                language=language_codes.get(src, src[:2].lower())
            )

        orig_text = transcript.text
        if not orig_text.strip():
            return "⚠️ 음성이 감지되지 않았습니다", "", None

        print(f"인식된 텍스트: {orig_text[:50]}...")

        # Translation
        print(f"{src} → {tgt} 번역 중...")
        trans_text = await gpt_translate(orig_text, src, tgt)

        # TTS -- skipped when the translation came back empty, since there
        # is nothing to speak.
        print("음성 합성 중...")
        audio_path = await gpt_tts(trans_text, tgt) if trans_text else None

        return orig_text, trans_text, audio_path
    except Exception as e:
        print(f"오디오 번역 오류: {e}")

        error_msg = str(e)
        if "ffmpeg" in error_msg.lower():
            error_msg += "\n\n💡 해결 방법:\n1. ffmpeg 설치: sudo apt-get install ffmpeg\n2. 또는 pip install moviepy"

        return "⚠️ 번역 중 오류 발생", error_msg, None
    finally:
        # Remove the extracted WAV on every exit path.
        if temp_audio_path and os.path.exists(temp_audio_path):
            os.unlink(temp_audio_path)

def translate_audio(file, src, tgt):
    """Blocking wrapper: run ``translate_audio_async`` to completion and return its result."""
    pending = translate_audio_async(file, src, tgt)
    return asyncio.run(pending)

# ─── 3. 실시간 STT (Whisper API 사용) ──────────────────────────
async def process_audio_chunk(audio_data, src_lang):
    """Transcribe one audio chunk to text.

    Args:
        audio_data: Either a ``(sample_rate, samples)`` tuple with a numpy
            sample array, or raw bytes that are written to a ``.wav`` file
            as-is.  None yields "".
        src_lang: Source language name; selects the language code and an
            optional prompt for the transcription request.

    Returns:
        str: The cleaned transcript, or "" when the chunk is too short
        (< 0.5 s), silent, matches a known hallucination pattern, or when
        transcription fails (the error is printed, not raised).
    """
    if audio_data is None:
        return ""

    # Slicing the first two letters of the name gives invalid codes for
    # Chinese ("ch") and Spanish ("sp"); map the names explicitly.
    language_codes = {
        "Korean": "ko", "English": "en", "Japanese": "ja", "Chinese": "zh",
        "Thai": "th", "Russian": "ru", "Vietnamese": "vi", "Spanish": "es",
        "French": "fr",
    }
    # Per-language prompt intended to reduce hallucinated output.
    language_prompts = {
        "Korean": "이것은 한국어 대화입니다.",
        "English": "This is an English conversation.",
        "Japanese": "これは日本語の会話です。",
        "Chinese": "这是中文对话。",
    }

    tmp_path = None
    try:
        if isinstance(audio_data, tuple):
            tmp_path = _audio_tuple_to_wav(audio_data)
            if tmp_path is None:  # too short or silent
                return ""
        else:
            # Raw bytes: store them unchanged.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
                tmp_path = tmp.name
                tmp.write(audio_data)

        api = get_client()
        with open(tmp_path, 'rb') as audio_file:
            transcript = await api.audio.transcriptions.create(
                model="whisper-1",
                file=audio_file,
                language=language_codes.get(src_lang, src_lang[:2].lower()),
                prompt=language_prompts.get(src_lang, ""),
                temperature=0.0  # more conservative decoding
            )

        return _clean_transcript(transcript.text)

    except Exception as e:
        print(f"STT 오류: {e}")
        return ""
    finally:
        # Delete the temp file on every path, including API failures.
        if tmp_path and os.path.exists(tmp_path):
            os.unlink(tmp_path)


def _audio_tuple_to_wav(audio_data):
    """Write a ``(sample_rate, samples)`` chunk to a temporary mono 16-bit WAV.

    Returns the file path, or None when the chunk is shorter than 0.5 s or
    its RMS level is below the silence threshold.
    """
    sample_rate, samples = audio_data
    samples = np.asarray(samples)

    # Ignore chunks shorter than 0.5 s.
    if len(samples) < sample_rate * 0.5:
        return None

    # Bring integer PCM onto a [-1, 1] scale so that the silence threshold
    # below means the same thing for integer and float input.
    if np.issubdtype(samples.dtype, np.integer):
        full_scale = float(np.iinfo(samples.dtype).max)
    else:
        full_scale = 1.0
    samples = samples.astype(np.float32) / full_scale

    # The WAV is written as mono; average the channels of 2-D input.
    if samples.ndim > 1:
        samples = samples.mean(axis=1)

    # Silence detection on the RMS level.
    rms = np.sqrt(np.mean(samples ** 2))
    if rms < 0.01:
        return None

    # Peak-normalize to 95 % of full scale.
    peak = np.max(np.abs(samples))
    if peak > 0:
        samples = samples / peak * 0.95

    pcm = (samples * 32767).astype(np.int16)

    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        wav_path = tmp.name
    try:
        with wave.open(wav_path, 'wb') as wav_file:
            wav_file.setnchannels(1)  # mono
            wav_file.setsampwidth(2)  # 16-bit
            wav_file.setframerate(sample_rate)
            wav_file.writeframes(pcm.tobytes())
    except Exception:
        # Do not leave a half-written file behind.
        if os.path.exists(wav_path):
            os.unlink(wav_path)
        raise
    return wav_path


def _clean_transcript(raw_text):
    """Drop immediately repeated sentences and known hallucinated phrases."""
    text = raw_text.strip()

    # Collapse a sentence that is repeated back-to-back.
    sentences = text.split('.')
    if len(sentences) > 1:
        unique_sentences = []
        for sent in sentences:
            sent = sent.strip()
            if sent and (not unique_sentences or sent != unique_sentences[-1]):
                unique_sentences.append(sent)
        text = '. '.join(unique_sentences)
        if text and not text.endswith('.'):
            text += '.'

    # News-broadcast phrases treated as hallucinations in short output.
    hallucination_patterns = [
        "MBC 뉴스", "KBS 뉴스", "SBS 뉴스", "JTBC 뉴스",
        "뉴스룸", "뉴스데스크", "앵커", "기자입니다"
    ]
    if len(text) < 50 and any(pattern in text for pattern in hallucination_patterns):
        return ""

    return text

# ─── 4. Gradio 스트림 핸들러 (동기 버전) ─────────────────────
def realtime_single_sync(audio, src, tgt, state):
    """Streaming handler: buffer microphone audio and translate it to one language.

    Incoming chunks are accumulated in ``state["audio_buffer"]``.  Once at
    least 2 seconds are buffered -- or when the stream ends (``audio`` is
    None) with audio still buffered -- the buffer is transcribed, translated
    and emptied.

    Args:
        audio: ``(sample_rate, samples)`` tuple for a new chunk, or None when
            the stream has ended.
        src: Source language name.
        tgt: Target language name.
        state: Per-session dict, or None on the first call.

    Returns:
        tuple: ``(original_text, translated_text, state)``.
    """
    if state is None:
        state = {"orig": "", "trans": "", "audio_buffer": [], "sample_rate": None}

    if audio is None:
        # End of stream: process whatever is still buffered.
        if state["audio_buffer"] and state["sample_rate"]:
            _flush_single_buffer(state, src, tgt)
    elif isinstance(audio, tuple):
        sample_rate, audio_array = audio
        state["sample_rate"] = sample_rate
        state["audio_buffer"].append(audio_array)

        # Sum the chunk lengths instead of concatenating on every call.
        buffered_samples = sum(len(chunk) for chunk in state["audio_buffer"])
        if sample_rate and buffered_samples / sample_rate >= 2.0:
            _flush_single_buffer(state, src, tgt)

    return state["orig"], state["trans"], state


def _flush_single_buffer(state, src, tgt):
    """Transcribe and translate the buffered audio, then empty the buffer."""
    try:
        combined_audio = np.concatenate(state["audio_buffer"])
        audio_data = (state["sample_rate"], combined_audio)

        # STT
        text = asyncio.run(process_audio_chunk(audio_data, src))
        if text:
            state["orig"] = state["orig"] + " " + text if state["orig"] else text

            # Translation; an empty result is not appended.
            trans = asyncio.run(gpt_translate(text, src, tgt))
            if trans:
                state["trans"] = state["trans"] + " " + trans if state["trans"] else trans
    except Exception as e:
        print(f"처리 오류: {e}")
    finally:
        # Reset the buffer on success and on error alike.
        state["audio_buffer"] = []

def realtime_four_sync(audio, src, state):
    """Streaming handler: buffer microphone audio and translate it to four languages.

    Incoming chunks are accumulated in ``state["audio_buffer"]``.  Once at
    least 2 seconds are buffered -- or when the stream ends (``audio`` is
    None) with audio still buffered -- the buffer is transcribed, translated
    into every language in ``FOUR`` and emptied.

    Args:
        audio: ``(sample_rate, samples)`` tuple for a new chunk, or None when
            the stream has ended.
        src: Source language name.
        state: Per-session dict, or None on the first call.

    Returns:
        tuple: ``(original, english, chinese, thai, russian, state)``.
    """
    if state is None:
        state = {"orig": "", "English": "", "Chinese": "", "Thai": "", "Russian": "",
                 "audio_buffer": [], "sample_rate": None}

    if audio is None:
        # End of stream: process whatever is still buffered.
        if state["audio_buffer"] and state["sample_rate"]:
            _flush_four_buffer(state, src)
    elif isinstance(audio, tuple):
        sample_rate, audio_array = audio
        state["sample_rate"] = sample_rate
        state["audio_buffer"].append(audio_array)

        # Sum the chunk lengths instead of concatenating on every call.
        buffered_samples = sum(len(chunk) for chunk in state["audio_buffer"])
        if sample_rate and buffered_samples / sample_rate >= 2.0:
            _flush_four_buffer(state, src)

    return (state["orig"], state["English"], state["Chinese"],
            state["Thai"], state["Russian"], state)


def _flush_four_buffer(state, src):
    """Transcribe the buffered audio, translate it into ``FOUR``, empty the buffer."""
    try:
        combined_audio = np.concatenate(state["audio_buffer"])
        audio_data = (state["sample_rate"], combined_audio)

        # STT
        text = asyncio.run(process_audio_chunk(audio_data, src))
        if text:
            state["orig"] = state["orig"] + " " + text if state["orig"] else text

            # Translate one language at a time (kept sequential, as in the
            # original); an empty result is not appended.
            for lang in FOUR:
                trans = asyncio.run(gpt_translate(text, src, lang))
                if trans:
                    state[lang] = state[lang] + " " + trans if state[lang] else trans
    except Exception as e:
        print(f"처리 오류: {e}")
    finally:
        # Reset the buffer on success and on error alike.
        state["audio_buffer"] = []

# ─── 5. UI ──────────────────────────────────────────────────
# Build the Gradio UI: four tabs (file/microphone translation, PDF translation,
# realtime single-language and realtime four-language translation).
with gr.Blocks(title="SMARTok Demo", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🌍 SMARTok 실시간 번역 시스템
        
        다국어 실시간 번역을 지원하는 통합 번역 플랫폼
        """
    )

    with gr.Tabs():
        # Tab 1 – audio/video translation
        with gr.TabItem("🎙️ 오디오/비디오"):
            gr.Markdown("### 🌐 오디오/비디오 파일 번역")

            with gr.Row():
                src1 = gr.Dropdown(LANG, value="Korean", label="입력 언어")
                tgt1 = gr.Dropdown(LANG, value="English", label="출력 언어")

            with gr.Tabs():
                with gr.TabItem("📁 파일 업로드"):
                    # File upload – accepts both audio and video files
                    aud1_file = gr.File(
                        label="오디오/비디오 파일 업로드",
                        file_types=[".mp3", ".wav", ".m4a", ".flac", ".ogg", ".opus", 
                                   ".mp4", ".avi", ".mov", ".mkv", ".webm", ".flv"],
                        type="filepath"
                    )
                    gr.Markdown(
                        "📌 **지원 형식**\n"
                        "- 오디오: MP3, WAV, M4A, FLAC, OGG, OPUS\n"
                        "- 비디오: MP4, AVI, MOV, MKV, WebM, FLV\n\n"
                        "⚠️ **주의사항**\n"
                        "- 비디오 파일은 오디오 추출 시간이 필요합니다\n"
                        "- 대용량 파일은 처리 시간이 오래 걸릴 수 있습니다"
                    )

                with gr.TabItem("🎤 마이크 녹음"):
                    aud1_mic = gr.Audio(
                        sources=["microphone"],
                        type="filepath",
                        label="마이크 녹음"
                    )
                    gr.Markdown("💡 **팁**: 녹음 후 '정지' 버튼을 눌러주세요")

            btn1 = gr.Button("🔄 번역 시작", variant="primary", size="lg")

            # Progress status display
            status1 = gr.Textbox(label="진행 상태", value="대기 중...", interactive=False)

            with gr.Row():
                with gr.Column():
                    o1 = gr.Textbox(label="📝 원문", lines=6)
                with gr.Column():
                    t1 = gr.Textbox(label="📝 번역", lines=6)

            a1 = gr.Audio(label="🔊 번역된 음성 (TTS)", type="filepath", autoplay=True)

            # Use whichever input is populated; the uploaded file takes
            # precedence over the microphone recording.
            def translate_with_status(file_input, mic_input, src, tgt):
                """Pick the active input and run the blocking translation on it.

                Returns the (original, translation, audio path) triple from
                ``translate_audio``, or a warning triple when neither input
                is set.
                """
                active_input = file_input if file_input else mic_input
                if not active_input:
                    return "⚠️ 파일을 업로드하거나 녹음을 해주세요", "", None

                # Runs synchronously; the status textbox itself is updated by
                # the click/then chain below.
                return translate_audio(active_input, src, tgt)

            # Three-step chain: show "processing", run the translation, show "done".
            # NOTE(review): the last step always writes "✅ 완료!";
            # translate_audio reports failures through its return values, so
            # the status text does not distinguish them.
            btn1.click(
                lambda: "처리 중... 잠시만 기다려주세요 ⏳",
                outputs=status1
            ).then(
                translate_with_status,
                [aud1_file, aud1_mic, src1, tgt1],
                [o1, t1, a1]
            ).then(
                lambda: "✅ 완료!",
                outputs=status1
            )

        # Tab 2 – PDF translation
        with gr.TabItem("📄 PDF"):
            src2 = gr.Dropdown(LANG, value="Korean", label="입력 언어")
            tgt2 = gr.Dropdown(LANG, value="English", label="출력 언어")
            pdf = gr.File(file_types=[".pdf"])
            btn2 = gr.Button("번역")
            o2 = gr.Textbox(label="추출 원문", lines=15)
            t2 = gr.Textbox(label="번역 결과", lines=15)

            btn2.click(translate_pdf, [pdf, src2, tgt2], [o2, t2])

        # Tab 3 – realtime, single target language
        with gr.TabItem("⏱️ 실시간 1"):
            src3 = gr.Dropdown(LANG, value="Korean", label="입력 언어")
            tgt3 = gr.Dropdown(LANG, value="English", label="출력 언어")

            with gr.Row():
                with gr.Column():
                    gr.Markdown("🎤 **마이크 입력**")
                    mic3 = gr.Audio(
                        sources=["microphone"], 
                        streaming=True,
                        type="numpy",  # request numpy-format audio explicitly
                        label="마이크"
                    )
                    gr.Markdown("💡 **사용 방법**\n- 2-3초 정도 문장을 말씀해주세요\n- 너무 짧거나 긴 문장은 인식이 어려울 수 있습니다")

                with gr.Column():
                    o3 = gr.Textbox(label="원문(실시간)", lines=8, interactive=False)
                    t3 = gr.Textbox(label="번역(실시간)", lines=8, interactive=False)

            # Session state passed to and returned by realtime_single_sync
            # (text so far plus the audio buffer).
            st3 = gr.State()

            # Wire the microphone stream to the single-language handler
            mic3.stream(
                realtime_single_sync,
                inputs=[mic3, src3, tgt3, st3],
                outputs=[o3, t3, st3],
                stream_every=0.5  # stream every 0.5 s (time_limit removed)
            )

        # Tab 4 – realtime, four target languages
        with gr.TabItem("🌏 실시간 4"):
            src4 = gr.Dropdown(LANG, value="Korean", label="입력 언어")

            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("🎤 **마이크 입력**")
                    mic4 = gr.Audio(
                        sources=["microphone"], 
                        streaming=True,
                        type="numpy",
                        label="마이크"
                    )
                    o4 = gr.Textbox(label="원문", lines=8, interactive=False)

                with gr.Column(scale=2):
                    with gr.Row():
                        e4 = gr.Textbox(label="English", lines=8, interactive=False)
                        c4 = gr.Textbox(label="Chinese(简体)", lines=8, interactive=False)
                    with gr.Row():
                        th4 = gr.Textbox(label="Thai", lines=8, interactive=False)
                        r4 = gr.Textbox(label="Russian", lines=8, interactive=False)

            # Session state passed to and returned by realtime_four_sync.
            st4 = gr.State()

            # Wire the microphone stream to the four-language handler
            mic4.stream(
                realtime_four_sync,
                inputs=[mic4, src4, st4],
                outputs=[o4, e4, c4, th4, r4, st4],
                stream_every=0.5
            )

# Start the server at import time: bind to all interfaces on port 7860 with
# debug mode enabled.
demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)