Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -2,6 +2,8 @@ import os, asyncio, json, tempfile, websockets, pdfplumber
|
|
2 |
import gradio as gr
|
3 |
import openai
|
4 |
from dotenv import load_dotenv
|
|
|
|
|
5 |
|
6 |
# ─── 0. Initialization ───────────────────────────────────────────────
|
7 |
load_dotenv()
|
@@ -69,10 +71,29 @@ async def process_audio_chunk(audio_data, src_lang):
|
|
69 |
return ""
|
70 |
|
71 |
try:
|
72 |
-
#
|
73 |
-
|
74 |
-
|
75 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
76 |
|
77 |
# Transcribe with the Whisper API
|
78 |
with open(tmp_path, 'rb') as audio_file:
|
@@ -92,58 +113,130 @@ async def process_audio_chunk(audio_data, src_lang):
|
|
92 |
def realtime_single_sync(audio, src, tgt, state):
|
93 |
"""λκΈ° λ²μ μ μ€μκ° λ¨μΌ μΈμ΄ λ²μ"""
|
94 |
if state is None:
|
95 |
-
state = {"orig": "", "trans": ""}
|
96 |
|
97 |
if audio is None:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
98 |
return state["orig"], state["trans"], state
|
99 |
|
100 |
-
#
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
|
|
|
|
109 |
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
115 |
|
116 |
return state["orig"], state["trans"], state
|
117 |
|
118 |
def realtime_four_sync(audio, src, state):
|
119 |
"""λκΈ° λ²μ μ μ€μκ° 4μΈμ΄ λ²μ"""
|
120 |
if state is None:
|
121 |
-
state = {"orig": "", "English": "", "Chinese": "", "Thai": "", "Russian": ""
|
|
|
122 |
|
123 |
if audio is None:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
124 |
return (state["orig"], state["English"], state["Chinese"],
|
125 |
state["Thai"], state["Russian"], state)
|
126 |
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
for lang in FOUR:
|
139 |
-
tasks.append(gpt_translate(text, src, lang))
|
140 |
-
|
141 |
-
translations = loop.run_until_complete(asyncio.gather(*tasks))
|
142 |
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
147 |
|
148 |
return (state["orig"], state["English"], state["Chinese"],
|
149 |
state["Thai"], state["Russian"], state)
|
|
|
2 |
import gradio as gr
|
3 |
import openai
|
4 |
from dotenv import load_dotenv
|
5 |
+
import numpy as np
|
6 |
+
import wave
|
7 |
|
8 |
# ─── 0. Initialization ───────────────────────────────────────────────
|
9 |
load_dotenv()
|
|
|
71 |
return ""
|
72 |
|
73 |
try:
|
74 |
+
# Gradio returns a (sample_rate, audio_array) tuple
|
75 |
+
if isinstance(audio_data, tuple):
|
76 |
+
sample_rate, audio_array = audio_data
|
77 |
+
# Convert the numpy array to a WAV file
|
78 |
+
import numpy as np
|
79 |
+
import wave
|
80 |
+
|
81 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
|
82 |
+
with wave.open(tmp.name, 'wb') as wav_file:
|
83 |
+
wav_file.setnchannels(1) # mono
|
84 |
+
wav_file.setsampwidth(2) # 16-bit
|
85 |
+
wav_file.setframerate(sample_rate)
|
86 |
+
|
87 |
+
# Convert the numpy array to 16-bit PCM
|
88 |
+
if audio_array.dtype == np.float32 or audio_array.dtype == np.float64:
|
89 |
+
audio_array = (audio_array * 32767).astype(np.int16)
|
90 |
+
wav_file.writeframes(audio_array.tobytes())
|
91 |
+
tmp_path = tmp.name
|
92 |
+
else:
|
93 |
+
# Case where the input is raw bytes
|
94 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
|
95 |
+
tmp.write(audio_data)
|
96 |
+
tmp_path = tmp.name
|
97 |
|
98 |
# Transcribe with the Whisper API
|
99 |
with open(tmp_path, 'rb') as audio_file:
|
|
|
113 |
async def _single_transcribe_and_translate(audio_data, src, tgt, state):
    """Transcribe one buffered segment and append the result to ``state``.

    Awaits ``process_audio_chunk`` for the text and, when it is non-empty,
    awaits ``gpt_translate`` for the ``tgt`` rendering. ``state["orig"]`` is
    updated before the translation call, so the transcript is kept even if
    the translation step raises.
    """
    text = await process_audio_chunk(audio_data, src)
    if not text:
        return

    state["orig"] = state["orig"] + " " + text if state["orig"] else text
    trans = await gpt_translate(text, src, tgt)
    state["trans"] = state["trans"] + " " + trans if state["trans"] else trans


def realtime_single_sync(audio, src, tgt, state):
    """Synchronous real-time translation into a single target language.

    Incoming audio chunks are buffered in ``state`` and processed once at
    least 1.5 seconds have accumulated. Passing ``audio=None`` (end of the
    stream) flushes whatever is still buffered.

    Args:
        audio: ``(sample_rate, audio_array)`` tuple, or ``None`` at stream end.
            Any other value is ignored.
        src: Source language, forwarded to ``process_audio_chunk`` and
            ``gpt_translate``.
        tgt: Target language, forwarded to ``gpt_translate``.
        state: Per-session dict, or ``None`` on the first call. Dicts that
            lack the buffering keys are upgraded in place.

    Returns:
        Tuple ``(state["orig"], state["trans"], state)``.
    """
    min_buffer_seconds = 1.5

    if state is None:
        state = {"orig": "", "trans": "", "audio_buffer": [], "sample_rate": None}
    else:
        # A state dict created before buffering was introduced has no buffer
        # keys; add them instead of failing with KeyError.
        state.setdefault("audio_buffer", [])
        state.setdefault("sample_rate", None)

    if audio is None:
        # End of stream: process whatever audio is still buffered.
        if state["audio_buffer"] and state["sample_rate"]:
            try:
                remaining = np.concatenate(state["audio_buffer"])
                # asyncio.run creates, runs and tears down its own loop, so no
                # closed loop is left installed as the thread's current loop.
                asyncio.run(
                    _single_transcribe_and_translate(
                        (state["sample_rate"], remaining), src, tgt, state
                    )
                )
            finally:
                # The stream is over; never carry this audio into the next one,
                # even when processing failed.
                state["audio_buffer"] = []

        return state["orig"], state["trans"], state

    # Buffer the incoming chunk.
    if isinstance(audio, tuple):
        sample_rate, audio_array = audio
        state["sample_rate"] = sample_rate
        state["audio_buffer"].append(audio_array)

        # Sum chunk lengths rather than concatenating the whole buffer on
        # every call just to measure it.
        buffered_samples = sum(len(chunk) for chunk in state["audio_buffer"])
        if buffered_samples / sample_rate >= min_buffer_seconds:
            combined = np.concatenate(state["audio_buffer"])
            asyncio.run(
                _single_transcribe_and_translate(
                    (sample_rate, combined), src, tgt, state
                )
            )
            # Reset only after successful processing, as before.
            state["audio_buffer"] = []

    return state["orig"], state["trans"], state
|
171 |
|
172 |
async def _four_transcribe_and_translate(audio_data, src, state):
    """Transcribe one buffered segment and append all translations to ``state``.

    Awaits ``process_audio_chunk`` for the text and, when it is non-empty,
    runs one ``gpt_translate`` call per entry of ``FOUR`` concurrently via
    ``asyncio.gather``. ``state["orig"]`` is updated before the translation
    calls, so the transcript is kept even if a translation raises.
    """
    text = await process_audio_chunk(audio_data, src)
    if not text:
        return

    state["orig"] = state["orig"] + " " + text if state["orig"] else text

    translations = await asyncio.gather(
        *(gpt_translate(text, src, lang) for lang in FOUR)
    )
    for lang, trans in zip(FOUR, translations):
        state[lang] = state[lang] + " " + trans if state[lang] else trans


def realtime_four_sync(audio, src, state):
    """Synchronous real-time translation into four languages at once.

    Incoming audio chunks are buffered in ``state`` and processed once at
    least 1.5 seconds have accumulated. Passing ``audio=None`` (end of the
    stream) flushes whatever is still buffered.

    Args:
        audio: ``(sample_rate, audio_array)`` tuple, or ``None`` at stream end.
            Any other value is ignored.
        src: Source language, forwarded to ``process_audio_chunk`` and
            ``gpt_translate``.
        state: Per-session dict, or ``None`` on the first call. Dicts that
            lack the buffering keys are upgraded in place.

    Returns:
        Tuple ``(orig, English, Chinese, Thai, Russian, state)`` read from
        the corresponding ``state`` keys.
    """
    min_buffer_seconds = 1.5

    if state is None:
        state = {
            "orig": "",
            "English": "",
            "Chinese": "",
            "Thai": "",
            "Russian": "",
            "audio_buffer": [],
            "sample_rate": None,
        }
    else:
        # A state dict created before buffering was introduced has no buffer
        # keys; add them instead of failing with KeyError.
        state.setdefault("audio_buffer", [])
        state.setdefault("sample_rate", None)

    if audio is None:
        # End of stream: process whatever audio is still buffered.
        if state["audio_buffer"] and state["sample_rate"]:
            try:
                remaining = np.concatenate(state["audio_buffer"])
                # asyncio.run creates, runs and tears down its own loop, so no
                # closed loop is left installed as the thread's current loop.
                asyncio.run(
                    _four_transcribe_and_translate(
                        (state["sample_rate"], remaining), src, state
                    )
                )
            finally:
                # The stream is over; never carry this audio into the next one,
                # even when processing failed.
                state["audio_buffer"] = []

        return (state["orig"], state["English"], state["Chinese"],
                state["Thai"], state["Russian"], state)

    # Buffer the incoming chunk.
    if isinstance(audio, tuple):
        sample_rate, audio_array = audio
        state["sample_rate"] = sample_rate
        state["audio_buffer"].append(audio_array)

        # Sum chunk lengths rather than concatenating the whole buffer on
        # every call just to measure it.
        buffered_samples = sum(len(chunk) for chunk in state["audio_buffer"])
        if buffered_samples / sample_rate >= min_buffer_seconds:
            combined = np.concatenate(state["audio_buffer"])
            asyncio.run(
                _four_transcribe_and_translate((sample_rate, combined), src, state)
            )
            # Reset only after successful processing, as before.
            state["audio_buffer"] = []

    return (state["orig"], state["English"], state["Chinese"],
            state["Thai"], state["Russian"], state)
|