openfree committed on
Commit
8000eeb
Β·
verified Β·
1 Parent(s): 4e89e7e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +132 -39
app.py CHANGED
@@ -2,6 +2,8 @@ import os, asyncio, json, tempfile, websockets, pdfplumber
2
  import gradio as gr
3
  import openai
4
  from dotenv import load_dotenv
 
 
5
 
6
  # ─── 0. μ΄ˆκΈ°ν™” ───────────────────────────────────────────────
7
  load_dotenv()
@@ -69,10 +71,29 @@ async def process_audio_chunk(audio_data, src_lang):
69
  return ""
70
 
71
  try:
72
- # μž„μ‹œ 파일둜 μ €μž₯
73
- with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
74
- tmp.write(audio_data)
75
- tmp_path = tmp.name
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
 
77
  # Whisper API둜 λ³€ν™˜
78
  with open(tmp_path, 'rb') as audio_file:
@@ -92,58 +113,130 @@ async def process_audio_chunk(audio_data, src_lang):
92
def realtime_single_sync(audio, src, tgt, state):
    """Synchronous real-time single-language translation.

    Transcribes one incoming audio chunk with the STT helper and appends
    both the transcript and its translation to the accumulated state.

    Parameters:
        audio: audio chunk delivered by Gradio; None means "no new data".
        src:   source language name forwarded to STT/translation helpers.
        tgt:   target language name.
        state: dict with keys "orig" (accumulated transcript) and "trans"
               (accumulated translation); created on first call when None.

    Returns:
        (accumulated transcript, accumulated translation, state)
    """
    if state is None:
        state = {"orig": "", "trans": ""}

    # No new audio: just echo the current accumulated texts.
    if audio is None:
        return state["orig"], state["trans"], state

    # asyncio.run creates and tears down a fresh event loop per call.  The
    # previous new_event_loop()/set_event_loop()/loop.close() sequence left
    # the thread's event-loop slot pointing at a *closed* loop, breaking any
    # later asyncio use on this thread.
    text = asyncio.run(process_audio_chunk(audio, src))
    if text:
        state["orig"] = state["orig"] + " " + text if state["orig"] else text
        # Translate only non-empty STT output; translating "" wastes an API call.
        trans = asyncio.run(gpt_translate(text, src, tgt))
        state["trans"] = state["trans"] + " " + trans if state["trans"] else trans

    return state["orig"], state["trans"], state
117
 
118
def realtime_four_sync(audio, src, state):
    """Synchronous real-time translation into the four fixed target languages.

    Transcribes one incoming audio chunk and fans the transcript out to all
    languages in ``FOUR`` concurrently, appending each result to the state.

    Parameters:
        audio: audio chunk delivered by Gradio; None means "no new data".
        src:   source language name forwarded to STT/translation helpers.
        state: dict with keys "orig", "English", "Chinese", "Thai", "Russian";
               created on first call when None.

    Returns:
        (orig, English, Chinese, Thai, Russian, state)
    """
    if state is None:
        state = {"orig": "", "English": "", "Chinese": "", "Thai": "", "Russian": ""}

    if audio is None:
        return (state["orig"], state["English"], state["Chinese"],
                state["Thai"], state["Russian"], state)

    # asyncio.run replaces the manual new_event_loop()/set_event_loop() pair,
    # which left the thread's event-loop slot pointing at a closed loop.
    text = asyncio.run(process_audio_chunk(audio, src))
    if text:
        state["orig"] = state["orig"] + " " + text if state["orig"] else text

        async def _translate_all():
            # gather must run inside a running loop, hence the coroutine wrapper.
            return await asyncio.gather(
                *(gpt_translate(text, src, lang) for lang in FOUR)
            )

        # Translate only non-empty STT output; "" would waste four API calls.
        translations = asyncio.run(_translate_all())
        for lang, trans in zip(FOUR, translations):
            state[lang] = state[lang] + " " + trans if state[lang] else trans

    return (state["orig"], state["English"], state["Chinese"],
            state["Thai"], state["Russian"], state)
 
2
  import gradio as gr
3
  import openai
4
  from dotenv import load_dotenv
5
+ import numpy as np
6
+ import wave
7
 
8
  # ─── 0. μ΄ˆκΈ°ν™” ───────────────────────────────────────────────
9
  load_dotenv()
 
71
  return ""
72
 
73
  try:
74
+ # GradioλŠ” (sample_rate, audio_array) νŠœν”Œμ„ λ°˜ν™˜
75
+ if isinstance(audio_data, tuple):
76
+ sample_rate, audio_array = audio_data
77
+ # numpy arrayλ₯Ό WAV 파일둜 λ³€ν™˜
78
+ import numpy as np
79
+ import wave
80
+
81
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
82
+ with wave.open(tmp.name, 'wb') as wav_file:
83
+ wav_file.setnchannels(1) # mono
84
+ wav_file.setsampwidth(2) # 16-bit
85
+ wav_file.setframerate(sample_rate)
86
+
87
+ # numpy arrayλ₯Ό 16-bit PCM으둜 λ³€ν™˜
88
+ if audio_array.dtype == np.float32 or audio_array.dtype == np.float64:
89
+ audio_array = (audio_array * 32767).astype(np.int16)
90
+ wav_file.writeframes(audio_array.tobytes())
91
+ tmp_path = tmp.name
92
+ else:
93
+ # bytes 데이터인 경우
94
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
95
+ tmp.write(audio_data)
96
+ tmp_path = tmp.name
97
 
98
  # Whisper API둜 λ³€ν™˜
99
  with open(tmp_path, 'rb') as audio_file:
 
113
def realtime_single_sync(audio, src, tgt, state):
    """Synchronous real-time single-language translation with audio buffering.

    Gradio streams audio as (sample_rate, numpy_array) tuples.  Chunks are
    buffered and, once ~1.5 s of audio has accumulated (or the stream ends,
    signalled by ``audio is None``), the buffer is transcribed and translated
    in one shot.  Results accumulate in ``state``.

    Parameters:
        audio: (sample_rate, np.ndarray) tuple from Gradio, or None at stream end.
        src:   source language name forwarded to STT/translation helpers.
        tgt:   target language name.
        state: dict with "orig", "trans", "audio_buffer" (list of arrays) and
               "sample_rate"; created on first call when None.

    Returns:
        (accumulated transcript, accumulated translation, state)
    """
    if state is None:
        state = {"orig": "", "trans": "", "audio_buffer": [], "sample_rate": None}

    def _flush():
        """Transcribe the buffered audio, translate it, and clear the buffer."""
        combined = np.concatenate(state["audio_buffer"])
        audio_data = (state["sample_rate"], combined)
        try:
            # asyncio.run avoids leaving a closed loop installed on the thread,
            # unlike the previous new_event_loop()/set_event_loop() pair.
            text = asyncio.run(process_audio_chunk(audio_data, src))
            if text:
                state["orig"] = state["orig"] + " " + text if state["orig"] else text
                trans = asyncio.run(gpt_translate(text, src, tgt))
                state["trans"] = state["trans"] + " " + trans if state["trans"] else trans
        finally:
            # Always drop processed audio so a failed call can't replay it forever.
            state["audio_buffer"] = []

    if audio is None:
        # Stream ended: process whatever remains in the buffer.
        if state["audio_buffer"] and state["sample_rate"]:
            _flush()
        return state["orig"], state["trans"], state

    if isinstance(audio, tuple):
        sample_rate, audio_array = audio
        state["sample_rate"] = sample_rate
        state["audio_buffer"].append(audio_array)

        # Cheap duration check: sum chunk lengths instead of concatenating the
        # whole buffer on every call just to measure it.
        buffered_samples = sum(len(chunk) for chunk in state["audio_buffer"])
        if buffered_samples / sample_rate >= 1.5:  # flush every ~1.5 s of audio
            _flush()

    return state["orig"], state["trans"], state
171
 
172
def realtime_four_sync(audio, src, state):
    """Synchronous real-time 4-language translation with audio buffering.

    Gradio streams audio as (sample_rate, numpy_array) tuples.  Chunks are
    buffered and, once ~1.5 s of audio has accumulated (or the stream ends,
    signalled by ``audio is None``), the buffer is transcribed once and the
    transcript is translated into every language in ``FOUR`` concurrently.

    Parameters:
        audio: (sample_rate, np.ndarray) tuple from Gradio, or None at stream end.
        src:   source language name forwarded to STT/translation helpers.
        state: dict with "orig", "English", "Chinese", "Thai", "Russian",
               "audio_buffer" (list of arrays) and "sample_rate"; created on
               first call when None.

    Returns:
        (orig, English, Chinese, Thai, Russian, state)
    """
    if state is None:
        state = {"orig": "", "English": "", "Chinese": "", "Thai": "", "Russian": "",
                 "audio_buffer": [], "sample_rate": None}

    def _flush():
        """Transcribe the buffered audio, fan out translations, clear the buffer."""
        combined = np.concatenate(state["audio_buffer"])
        audio_data = (state["sample_rate"], combined)
        try:
            # asyncio.run avoids leaving a closed loop installed on the thread,
            # unlike the previous new_event_loop()/set_event_loop() pair.
            text = asyncio.run(process_audio_chunk(audio_data, src))
            if text:
                state["orig"] = state["orig"] + " " + text if state["orig"] else text

                async def _translate_all():
                    # gather must run inside a running loop, hence the wrapper.
                    return await asyncio.gather(
                        *(gpt_translate(text, src, lang) for lang in FOUR)
                    )

                translations = asyncio.run(_translate_all())
                for lang, trans in zip(FOUR, translations):
                    state[lang] = state[lang] + " " + trans if state[lang] else trans
        finally:
            # Always drop processed audio so a failed call can't replay it forever.
            state["audio_buffer"] = []

    if audio is None:
        # Stream ended: process whatever remains in the buffer.
        if state["audio_buffer"] and state["sample_rate"]:
            _flush()
        return (state["orig"], state["English"], state["Chinese"],
                state["Thai"], state["Russian"], state)

    if isinstance(audio, tuple):
        sample_rate, audio_array = audio
        state["sample_rate"] = sample_rate
        state["audio_buffer"].append(audio_array)

        # Cheap duration check: sum chunk lengths instead of concatenating the
        # whole buffer on every call just to measure it.
        buffered_samples = sum(len(chunk) for chunk in state["audio_buffer"])
        if buffered_samples / sample_rate >= 1.5:  # flush every ~1.5 s of audio
            _flush()

    return (state["orig"], state["English"], state["Chinese"],
            state["Thai"], state["Russian"], state)