Spaces: Running on Zero

Update app.py

app.py CHANGED
@@ -19,20 +19,39 @@ FOUR = ["English","Chinese","Thai","Russian"]
 WS_URL = "wss://api.openai.com/v1/realtime"  # fixed to the correct endpoint
 
 # ─── 1. Shared GPT translation / TTS ────────────────────────────
+# Global client management
+client = None
+
+def get_client():
+    global client
+    if client is None:
+        client = openai.AsyncClient()
+    return client
+
 async def gpt_translate(text, src, tgt):
-
-
-
-
-
-
-
+    try:
+        client = get_client()
+        rsp = await client.chat.completions.create(
+            model="gpt-3.5-turbo",
+            messages=[{"role":"system",
+                       "content":f"Translate {src} → {tgt}. Return only the text."},
+                      {"role":"user","content":text}],
+            temperature=0.3, max_tokens=2048)
+        return rsp.choices[0].message.content.strip()
+    except Exception as e:
+        print(f"Translation error: {e}")
+        return ""
 
 async def gpt_tts(text, lang):
-
-
-
-
+    try:
+        client = get_client()
+        rsp = await client.audio.speech.create(
+            model="tts-1", voice=VOICE[lang], input=text[:4096])
+        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
+        tmp.write(rsp.content); tmp.close(); return tmp.name
+    except Exception as e:
+        print(f"TTS error: {e}")
+        return None
 
 # ─── 2. PDF translation ─────────────────────────────────────────
 def translate_pdf(file, src, tgt):
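Note: the lazy get_client() pattern above defers constructing openai.AsyncClient() until the first request, so the API key only has to be configured by then, and every helper reuses one instance. A minimal sketch of driving the helpers from synchronous code (assuming openai is imported and OPENAI_API_KEY is set, neither shown in this hunk):

    import asyncio

    async def demo():
        client = get_client()           # created on first use
        assert client is get_client()   # the same instance is reused
        return await gpt_translate("Hello", "English", "Russian")

    print(asyncio.run(demo()))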
@@ -47,19 +66,24 @@ def translate_pdf(file, src, tgt):
 async def translate_audio_async(file, src, tgt):
     if not file: return "⚠️ Audio upload required", "", None
 
-
-
-
-
-
-
-
-
-
-
-
-
-
+    try:
+        # STT: use the Whisper API
+        client = get_client()
+        with open(file, 'rb') as audio_file:
+            transcript = await client.audio.transcriptions.create(
+                model="whisper-1",
+                file=audio_file,
+                language=src[:2].lower()  # simplified language code
+            )
+
+        orig_text = transcript.text
+        trans_text = await gpt_translate(orig_text, src, tgt)
+        audio_path = await gpt_tts(trans_text, tgt)
+
+        return orig_text, trans_text, audio_path
+    except Exception as e:
+        print(f"Audio translation error: {e}")
+        return "⚠️ Error during translation", str(e), None
 
 def translate_audio(file, src, tgt):
     return asyncio.run(translate_audio_async(file, src, tgt))
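translate_audio() bridges Gradio's synchronous callback interface to the async helper with asyncio.run, which creates and closes a fresh event loop per call. That is fine when no loop is running in the calling thread; if one ever were, asyncio.run would raise RuntimeError. A defensive wrapper (hypothetical, not part of this commit) could look like:

    import asyncio
    from concurrent.futures import ThreadPoolExecutor

    _pool = ThreadPoolExecutor(max_workers=1)

    def run_async(coro):
        # Use asyncio.run when this thread has no running loop;
        # otherwise hand the coroutine to a worker thread.
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            return asyncio.run(coro)
        return _pool.submit(asyncio.run, coro).result()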
@@ -121,7 +145,8 @@ async def process_audio_chunk(audio_data, src_lang):
 
     prompt = language_prompts.get(src_lang, "")
 
-
+    client = get_client()
+    transcript = await client.audio.transcriptions.create(
         model="whisper-1",
         file=audio_file,
         language=src_lang[:2].lower(),
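One caveat with the language=src_lang[:2].lower() shortcut: slicing happens to work for "English" -> "en", "Thai" -> "th" and "Russian" -> "ru", but "Chinese"[:2].lower() yields "ch", which is not the ISO-639-1 code ("zh") that Whisper expects. An explicit mapping (hypothetical, not in the commit) avoids that:

    LANG_CODES = {"English": "en", "Chinese": "zh", "Thai": "th", "Russian": "ru"}

    def lang_code(name: str) -> str:
        # Fall back to the old heuristic for any language not listed.
        return LANG_CODES.get(name, name[:2].lower())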
@@ -171,20 +196,19 @@ def realtime_single_sync(audio, src, tgt, state):
     if audio is None:
         # Flush whatever is left in the buffer when the stream ends
         if state["audio_buffer"] and state["sample_rate"]:
-            loop = asyncio.new_event_loop()
-            asyncio.set_event_loop(loop)
             try:
                 # Merge the buffered audio
                 combined_audio = np.concatenate(state["audio_buffer"])
                 audio_data = (state["sample_rate"], combined_audio)
 
-
+                # Run the async jobs
+                text = asyncio.run(process_audio_chunk(audio_data, src))
                 if text:
                     state["orig"] = state["orig"] + " " + text if state["orig"] else text
-                    trans =
+                    trans = asyncio.run(gpt_translate(text, src, tgt))
                     state["trans"] = state["trans"] + " " + trans if state["trans"] else trans
-
-
+            except Exception as e:
+                print(f"Processing error: {e}")
             state["audio_buffer"] = []
 
         return state["orig"], state["trans"], state
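The deleted new_event_loop()/set_event_loop() pair built a loop by hand and never closed it, leaking one loop per flush; asyncio.run owns the whole lifecycle. The two patterns side by side:

    import asyncio

    async def work():
        return "done"

    # Before: manual loop management (the removed code never closed the loop)
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        result = loop.run_until_complete(work())
    finally:
        loop.close()

    # After: create, run and close in one call
    result = asyncio.run(work())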
@@ -196,29 +220,28 @@ def realtime_single_sync(audio, src, tgt, state):
     state["audio_buffer"].append(audio_array)
 
     # Process only once enough audio has accumulated (about 2-3 seconds)
-
-
-
-        asyncio.set_event_loop(loop)
-
+    if state["audio_buffer"]:  # make sure the buffer is not empty
+        buffer_duration = len(np.concatenate(state["audio_buffer"])) / sample_rate
+        if buffer_duration >= 2.0:  # process every ~2 seconds
             try:
                 # Merge the buffered audio
                 combined_audio = np.concatenate(state["audio_buffer"])
                 audio_data = (sample_rate, combined_audio)
 
                 # STT
-                text =
+                text = asyncio.run(process_audio_chunk(audio_data, src))
                 if text:
                     state["orig"] = state["orig"] + " " + text if state["orig"] else text
 
                     # Translation
-                    trans =
+                    trans = asyncio.run(gpt_translate(text, src, tgt))
                     state["trans"] = state["trans"] + " " + trans if state["trans"] else trans
 
                 # Reset the buffer
                 state["audio_buffer"] = []
-
-
+            except Exception as e:
+                print(f"Processing error: {e}")
+                state["audio_buffer"] = []  # reset the buffer even on error
 
     return state["orig"], state["trans"], state
 
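The 2-second gate is plain sample arithmetic: the buffer holds raw samples, so duration equals total sample count divided by the sample rate. For example, at a (hypothetical) 16 kHz mic rate:

    import numpy as np

    sample_rate = 16_000                                 # Hz
    audio_buffer = [np.zeros(8_000), np.zeros(24_000)]   # buffered chunks

    buffer_duration = len(np.concatenate(audio_buffer)) / sample_rate
    print(buffer_duration)   # 2.0 -> crosses the >= 2.0 threshold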
@@ -231,26 +254,20 @@ def realtime_four_sync(audio, src, state):
     if audio is None:
         # Flush whatever is left in the buffer when the stream ends
         if state["audio_buffer"] and state["sample_rate"]:
-            loop = asyncio.new_event_loop()
-            asyncio.set_event_loop(loop)
             try:
                 combined_audio = np.concatenate(state["audio_buffer"])
                 audio_data = (state["sample_rate"], combined_audio)
 
-                text =
+                text = asyncio.run(process_audio_chunk(audio_data, src))
                 if text:
                     state["orig"] = state["orig"] + " " + text if state["orig"] else text
 
-
+                    # Translate sequentially (parallel runs can cause problems)
                     for lang in FOUR:
-
-
-                    translations = loop.run_until_complete(asyncio.gather(*tasks))
-
-                    for lang, trans in zip(FOUR, translations):
+                        trans = asyncio.run(gpt_translate(text, src, lang))
                         state[lang] = state[lang] + " " + trans if state[lang] else trans
-
-
+            except Exception as e:
+                print(f"Processing error: {e}")
             state["audio_buffer"] = []
 
         return (state["orig"], state["English"], state["Chinese"],
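This hunk trades asyncio.gather on a hand-built loop for four sequential asyncio.run calls; simpler, but each chunk now waits for four round trips in a row. Since the four translations are independent, they could still run concurrently under a single asyncio.run (a sketch using names from this file, not part of the commit):

    import asyncio

    async def translate_all(text, src):
        tasks = [gpt_translate(text, src, lang) for lang in FOUR]
        return await asyncio.gather(*tasks)   # one loop, four concurrent calls

    for lang, trans in zip(FOUR, asyncio.run(translate_all(text, src))):
        state[lang] = state[lang] + " " + trans if state[lang] else trans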
@@ -263,33 +280,27 @@ def realtime_four_sync(audio, src, state):
     state["audio_buffer"].append(audio_array)
 
     # Process only once enough audio has accumulated
-
-
-
-        asyncio.set_event_loop(loop)
-
+    if state["audio_buffer"]:  # make sure the buffer is not empty
+        buffer_duration = len(np.concatenate(state["audio_buffer"])) / sample_rate
+        if buffer_duration >= 2.0:  # process every ~2 seconds
             try:
                 combined_audio = np.concatenate(state["audio_buffer"])
                 audio_data = (sample_rate, combined_audio)
 
                 # STT
-                text =
+                text = asyncio.run(process_audio_chunk(audio_data, src))
                 if text:
                     state["orig"] = state["orig"] + " " + text if state["orig"] else text
 
-                    # Translate into the 4 languages
-                    tasks = []
+                    # Translate into the 4 languages sequentially
                     for lang in FOUR:
-
-
-                    translations = loop.run_until_complete(asyncio.gather(*tasks))
-
-                    for lang, trans in zip(FOUR, translations):
+                        trans = asyncio.run(gpt_translate(text, src, lang))
                         state[lang] = state[lang] + " " + trans if state[lang] else trans
 
                 state["audio_buffer"] = []
-
-
+            except Exception as e:
+                print(f"Processing error: {e}")
+                state["audio_buffer"] = []
 
     return (state["orig"], state["English"], state["Chinese"],
             state["Thai"], state["Russian"], state)
@@ -347,8 +358,7 @@ with gr.Blocks(title="SMARTok Demo") as demo:
         realtime_single_sync,
         inputs=[mic3, src3, tgt3, st3],
         outputs=[o3, t3, st3],
-
-        stream_every=0.5  # stream every 0.5 s
+        stream_every=0.5  # stream every 0.5 s (time_limit removed)
     )
 
     # Tab 4 - real-time 4 languages
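Dropping time_limit while keeping stream_every=0.5 means the microphone stream keeps delivering a chunk every half second for as long as it stays open, rather than being cut off after a fixed window; each chunk is appended to state["audio_buffer"] until the 2-second gate above fires. The call shape, assuming the handler is wired through the audio component's stream event:

    mic3.stream(
        realtime_single_sync,
        inputs=[mic3, src3, tgt3, st3],
        outputs=[o3, t3, st3],
        stream_every=0.5,   # forward a chunk every 0.5 s
        # time_limit omitted: the stream is no longer capped
    )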
@@ -381,7 +391,6 @@ with gr.Blocks(title="SMARTok Demo") as demo:
         realtime_four_sync,
         inputs=[mic4, src4, st4],
         outputs=[o4, e4, c4, th4, r4, st4],
-        time_limit=30,
         stream_every=0.5
     )
 