Spaces:

VIDraft
/

voice-trans

Running on Zero

App Files Files Community

openfree committed on Jun 9

Commit

8000eeb

verified ·

1 Parent(s): 4e89e7e

Update app.py

Browse files

Files changed (1) hide show

app.py +132 -39

app.py CHANGED Viewed

@@ -2,6 +2,8 @@ import os, asyncio, json, tempfile, websockets, pdfplumber
 import gradio as gr
 import openai
 from dotenv import load_dotenv
 # ─── 0. 초기화 ───────────────────────────────────────────────
 load_dotenv()
@@ -69,10 +71,29 @@ async def process_audio_chunk(audio_data, src_lang):
         return ""
     try:
-        # 임시 파일로 저장
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
-            tmp.write(audio_data)
-            tmp_path = tmp.name
         # Whisper API로 변환
         with open(tmp_path, 'rb') as audio_file:
@@ -92,58 +113,130 @@ async def process_audio_chunk(audio_data, src_lang):
 def realtime_single_sync(audio, src, tgt, state):
     """Synchronous version of real-time single-language translation."""
     if state is None:
-        state = {"orig": "", "trans": ""}
     if audio is None:
         return state["orig"], state["trans"], state
-    # Run the async work synchronously on a fresh event loop
-    loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(loop)
-    try:
-        # STT (speech-to-text)
-        text = loop.run_until_complete(process_audio_chunk(audio, src))
-        if text:
-            state["orig"] = state["orig"] + " " + text if state["orig"] else text
-            # Translate the recognized text
-            trans = loop.run_until_complete(gpt_translate(text, src, tgt))
-            state["trans"] = state["trans"] + " " + trans if state["trans"] else trans
-    finally:
-        loop.close()
     return state["orig"], state["trans"], state
 def realtime_four_sync(audio, src, state):
     """Synchronous version of real-time four-language translation."""
     if state is None:
-        state = {"orig": "", "English": "", "Chinese": "", "Thai": "", "Russian": ""}
     if audio is None:
         return (state["orig"], state["English"], state["Chinese"],
                 state["Thai"], state["Russian"], state)
-    loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(loop)
-    try:
-        # STT (speech-to-text)
-        text = loop.run_until_complete(process_audio_chunk(audio, src))
-        if text:
-            state["orig"] = state["orig"] + " " + text if state["orig"] else text
-            # Translate into each language in FOUR
-            tasks = []
-            for lang in FOUR:
-                tasks.append(gpt_translate(text, src, lang))
-            translations = loop.run_until_complete(asyncio.gather(*tasks))
-            for lang, trans in zip(FOUR, translations):
-                state[lang] = state[lang] + " " + trans if state[lang] else trans
-    finally:
-        loop.close()
     return (state["orig"], state["English"], state["Chinese"],
             state["Thai"], state["Russian"], state)

 import gradio as gr
 import openai
 from dotenv import load_dotenv
+import numpy as np
+import wave
 # ─── 0. 초기화 ───────────────────────────────────────────────
 load_dotenv()
         return ""
     try:
+        # Gradio는 (sample_rate, audio_array) 튜플을 반환
+        if isinstance(audio_data, tuple):
+            sample_rate, audio_array = audio_data
+            # numpy array를 WAV 파일로 변환
+            import numpy as np
+            import wave
+            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
+                with wave.open(tmp.name, 'wb') as wav_file:
+                    wav_file.setnchannels(1)  # mono
+                    wav_file.setsampwidth(2)  # 16-bit
+                    wav_file.setframerate(sample_rate)
+                    # numpy array를 16-bit PCM으로 변환
+                    if audio_array.dtype == np.float32 or audio_array.dtype == np.float64:
+                        audio_array = (audio_array * 32767).astype(np.int16)
+                    wav_file.writeframes(audio_array.tobytes())
+                tmp_path = tmp.name
+        else:
+            # bytes 데이터인 경우
+            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
+                tmp.write(audio_data)
+                tmp_path = tmp.name
         # Whisper API로 변환
         with open(tmp_path, 'rb') as audio_file:
 def realtime_single_sync(audio, src, tgt, state):
     """Synchronous version of real-time single-language translation."""
     if state is None:
+        state = {"orig": "", "trans": "", "audio_buffer": [], "sample_rate": None}
     if audio is None:
+        # audio is None (stream ended): process whatever is left in the buffer
+        if state["audio_buffer"] and state["sample_rate"]:
+            loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(loop)
+            try:
+                # Concatenate the buffered audio chunks
+                combined_audio = np.concatenate(state["audio_buffer"])
+                audio_data = (state["sample_rate"], combined_audio)
+                text = loop.run_until_complete(process_audio_chunk(audio_data, src))
+                if text:
+                    state["orig"] = state["orig"] + " " + text if state["orig"] else text
+                    trans = loop.run_until_complete(gpt_translate(text, src, tgt))
+                    state["trans"] = state["trans"] + " " + trans if state["trans"] else trans
+            finally:
+                loop.close()
+            state["audio_buffer"] = []
         return state["orig"], state["trans"], state
+    # Buffer the incoming audio data
+    if isinstance(audio, tuple):
+        sample_rate, audio_array = audio
+        state["sample_rate"] = sample_rate
+        state["audio_buffer"].append(audio_array)
+        # Process only once enough audio has accumulated (roughly 1-2 seconds)
+        buffer_duration = len(np.concatenate(state["audio_buffer"])) / sample_rate
+        if buffer_duration >= 1.5:  # process once at least 1.5 seconds are buffered
+            loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(loop)
+            try:
+                # Concatenate the buffered audio chunks
+                combined_audio = np.concatenate(state["audio_buffer"])
+                audio_data = (sample_rate, combined_audio)
+                # STT (speech-to-text)
+                text = loop.run_until_complete(process_audio_chunk(audio_data, src))
+                if text:
+                    state["orig"] = state["orig"] + " " + text if state["orig"] else text
+                    # Translate the recognized text
+                    trans = loop.run_until_complete(gpt_translate(text, src, tgt))
+                    state["trans"] = state["trans"] + " " + trans if state["trans"] else trans
+                # Reset the buffer
+                state["audio_buffer"] = []
+            finally:
+                loop.close()
     return state["orig"], state["trans"], state
 def realtime_four_sync(audio, src, state):
     """Synchronous version of real-time four-language translation."""
     if state is None:
+        state = {"orig": "", "English": "", "Chinese": "", "Thai": "", "Russian": "",
+                 "audio_buffer": [], "sample_rate": None}
     if audio is None:
+        # audio is None (stream ended): process whatever is left in the buffer
+        if state["audio_buffer"] and state["sample_rate"]:
+            loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(loop)
+            try:
+                combined_audio = np.concatenate(state["audio_buffer"])
+                audio_data = (state["sample_rate"], combined_audio)
+                text = loop.run_until_complete(process_audio_chunk(audio_data, src))
+                if text:
+                    state["orig"] = state["orig"] + " " + text if state["orig"] else text
+                    tasks = []
+                    for lang in FOUR:
+                        tasks.append(gpt_translate(text, src, lang))
+                    translations = loop.run_until_complete(asyncio.gather(*tasks))
+                    for lang, trans in zip(FOUR, translations):
+                        state[lang] = state[lang] + " " + trans if state[lang] else trans
+            finally:
+                loop.close()
+            state["audio_buffer"] = []
         return (state["orig"], state["English"], state["Chinese"],
                 state["Thai"], state["Russian"], state)
+    # Buffer the incoming audio data
+    if isinstance(audio, tuple):
+        sample_rate, audio_array = audio
+        state["sample_rate"] = sample_rate
+        state["audio_buffer"].append(audio_array)
+        # Process only once enough audio has accumulated
+        buffer_duration = len(np.concatenate(state["audio_buffer"])) / sample_rate
+        if buffer_duration >= 1.5:  # process once at least 1.5 seconds are buffered
+            loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(loop)
+            try:
+                combined_audio = np.concatenate(state["audio_buffer"])
+                audio_data = (sample_rate, combined_audio)
+                # STT (speech-to-text)
+                text = loop.run_until_complete(process_audio_chunk(audio_data, src))
+                if text:
+                    state["orig"] = state["orig"] + " " + text if state["orig"] else text
+                    # Translate into each language in FOUR
+                    tasks = []
+                    for lang in FOUR:
+                        tasks.append(gpt_translate(text, src, lang))
+                    translations = loop.run_until_complete(asyncio.gather(*tasks))
+                    for lang, trans in zip(FOUR, translations):
+                        state[lang] = state[lang] + " " + trans if state[lang] else trans
+                state["audio_buffer"] = []
+            finally:
+                loop.close()
     return (state["orig"], state["English"], state["Chinese"],
             state["Thai"], state["Russian"], state)