Update app.py
app.py CHANGED
@@ -56,7 +56,7 @@ def get_vocals(input_file):
         'data': [
             {
                 'path': json_data[0],
-                'url': 'https://politrees-audio-separator-uvr.hf.space/gradio_api/file='+json_data[0],
+                'url': 'https://politrees-audio-separator-uvr.hf.space/gradio_api/file=' + json_data[0],
                 'orig_name': pathlib.Path(input_file).name,
                 'size': file_len,
                 'mime_type': 'audio/wav',
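
The only change in this hunk is cosmetic: spaces around the + concatenation in the URL. For reference, a minimal self-contained sketch of the payload this code builds; the json_data path, input_file, and file_len values below are invented placeholders, where app.py fills them from the separator Space's API response and the uploaded file:

import pathlib

# Invented placeholders -- app.py fills these from the Space's JSON response.
json_data = ["/tmp/gradio/example/output_vocals.wav"]
input_file = "song.wav"
file_len = 1234567

payload = {
    'path': json_data[0],
    'url': 'https://politrees-audio-separator-uvr.hf.space/gradio_api/file=' + json_data[0],
    'orig_name': pathlib.Path(input_file).name,
    'size': file_len,
    'mime_type': 'audio/wav',
}
print(payload['url'])
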
@@ -135,17 +135,30 @@ def get_vocals(input_file):
     return None
 
 # -------------------------------
-# Normalization Function
+# Advanced Normalization Function
 # -------------------------------
-def …
+def advanced_normalize_audio(audio, threshold_ratio=0.6, window_size=1024):
     """
-…
-a…
-…
+    This advanced normalization function computes a moving-average envelope of the absolute
+    audio signal using a specified window size. It then zeroes out portions of the signal
+    where the envelope falls below a threshold (defined as a ratio of the maximum envelope value).
+
+    Parameters:
+        audio (np.ndarray): Input audio signal.
+        threshold_ratio (float): Ratio (0-1) to determine the minimum envelope value to keep.
+        window_size (int): Size of the moving window used to compute the envelope.
+
+    Returns:
+        np.ndarray: The normalized audio signal.
     """
-…
-…
-…
+    # Compute moving-average envelope
+    envelope = np.convolve(np.abs(audio), np.ones(window_size) / window_size, mode='same')
+    max_env = np.max(envelope)
+    threshold = threshold_ratio * max_env
+    # Create a mask: keep samples where the envelope meets or exceeds the threshold.
+    mask = envelope >= threshold
+    # Optionally, you might smooth the mask further to avoid abrupt cuts.
+    normalized_audio = audio * mask.astype(audio.dtype)
     return normalized_audio
 
 # -------------------------------
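
A quick way to sanity-check the new envelope gate: run it on a signal with known quiet and loud regions and confirm only the quiet parts are zeroed. The function below is copied from the hunk above (docstring trimmed); the test signal is invented for illustration:

import numpy as np

# Copied from app.py's advanced_normalize_audio, docstring trimmed.
def advanced_normalize_audio(audio, threshold_ratio=0.6, window_size=1024):
    envelope = np.convolve(np.abs(audio), np.ones(window_size) / window_size, mode='same')
    mask = envelope >= threshold_ratio * np.max(envelope)
    return audio * mask.astype(audio.dtype)

# Invented test signal: quiet / loud / quiet, one second each at 16 kHz.
sr = 16000
t = np.linspace(0, 1, sr, endpoint=False, dtype=np.float32)
quiet = (0.01 * np.sin(2 * np.pi * 440 * t)).astype(np.float32)
loud = (0.8 * np.sin(2 * np.pi * 440 * t)).astype(np.float32)
signal = np.concatenate([quiet, loud, quiet])

out = advanced_normalize_audio(signal)
print(np.abs(out[: sr // 2]).max())    # ~0.0: quiet region is gated out
print(np.abs(out[sr : 2 * sr]).max())  # ~0.8: loud region passes through

Note that the function is a gate rather than a loudness normalizer: samples are either kept verbatim or zeroed, never rescaled.
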
@@ -207,7 +220,6 @@ def transcribe(audio_file, model_size="base", debug=False, pause_threshold=0.0,
             debug_log.append("Vocal extraction succeeded; downloading extracted audio...")
             response = requests.get(extracted_url)
             if response.status_code == 200:
-                # Write to a temporary file
                 with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
                     tmp.write(response.content)
                     audio_file = tmp.name
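
This hunk only drops a redundant comment, but the delete=False it sat next to is the load-bearing detail: the temp file must survive the with block so it can be reopened by path afterwards. A minimal sketch of the same pattern with explicit cleanup (placeholder URL; app.py itself does not show a cleanup step):

import os
import tempfile
import requests

response = requests.get("https://example.com/audio.mp3", timeout=60)  # placeholder URL
if response.status_code == 200:
    # delete=False keeps the file on disk after the context manager exits,
    # so librosa.load (or anything else) can reopen it by path.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
        tmp.write(response.content)
        audio_file = tmp.name
    # ... process audio_file ...
    os.remove(audio_file)  # explicit cleanup; a suggestion, not taken from app.py
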
@@ -221,26 +233,26 @@ def transcribe(audio_file, model_size="base", debug=False, pause_threshold=0.0,
     audio, sr = librosa.load(audio_file, sr=16000)
     debug_log.append(f"Audio loaded: {len(audio)/sr:.2f} seconds long at {sr} Hz")
 
-    # If …
+    # If vocal extraction was used, apply advanced normalization
     if vocal_extraction:
-        audio = …
-        debug_log.append("…
+        audio = advanced_normalize_audio(audio)
+        debug_log.append("Advanced normalization applied to extracted audio to remove low-amplitude segments.")
 
     # Select the model and set batch size
     model = models[model_size]
     batch_size = 8 if model_size == "tiny" else 4
 
-    # Use …
+    # Use provided language if set; otherwise, use language detection.
    if language:
         transcript = model.transcribe(audio, batch_size=batch_size, language=language)
     else:
         transcript = model.transcribe(audio, batch_size=batch_size)
         language = transcript.get("language", "unknown")
 
-    # Load alignment model using the specified …
+    # Load alignment model using the specified language
     model_a, metadata = whisperx.load_align_model(language_code=language, device=device)
 
-    # If pause_threshold > 0, split …
+    # If pause_threshold > 0, split audio and process segments individually
     if pause_threshold > 0:
         segments = split_audio_by_pause(audio, sr, pause_threshold)
         debug_log.append(f"Audio split into {len(segments)} segment(s) using a pause threshold of {pause_threshold}s")
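
split_audio_by_pause is called here but defined elsewhere in app.py, outside this diff. A hypothetical sketch of what such a splitter can look like, built on librosa.effects.split; the top_db value and the gap-grouping logic are assumptions for illustration, not the app's actual implementation:

import numpy as np
import librosa

def split_audio_by_pause(audio, sr, pause_threshold, top_db=30):
    # Non-silent intervals as (start, end) sample indices.
    intervals = librosa.effects.split(audio, top_db=top_db)
    segments, current, prev_end = [], [], None
    for start, end in intervals:
        # Start a new segment when the silent gap exceeds the threshold.
        if prev_end is not None and (start - prev_end) / sr > pause_threshold:
            segments.append(np.concatenate(current))
            current = []
        current.append(audio[start:end])
        prev_end = end
    if current:
        segments.append(np.concatenate(current))
    return segments
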
@@ -307,12 +319,10 @@ with gr.Blocks(title="WhisperX CPU Transcription") as demo:
             interactive=True,
             info="Set a pause duration threshold. Audio pauses longer than this will be used to split the audio into segments."
         )
-        # New input for vocal extraction feature
         vocal_extraction_checkbox = gr.Checkbox(
             label="Extract Vocals (improves accuracy on noisy audio)",
             value=False
         )
-        # New language selection (default English)
         language_input = gr.Textbox(
             label="Language Code (e.g., en, es, fr)",
             placeholder="Enter language code",
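
The removed comment described this field as "New language selection (default English)", and the hunk cuts off before the Textbox's remaining arguments. A sketch of how the full component plausibly reads, with value="en" assumed from that removed comment:

import gradio as gr

with gr.Blocks(title="WhisperX CPU Transcription") as demo:
    language_input = gr.Textbox(
        label="Language Code (e.g., en, es, fr)",
        placeholder="Enter language code",
        value="en",  # assumed default, per the removed "(default English)" comment
    )
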
@@ -334,7 +344,6 @@ with gr.Blocks(title="WhisperX CPU Transcription") as demo:
         visible=False,
     )
 
-    # Toggle debug visibility
     def toggle_debug(debug_enabled):
         return gr.update(visible=debug_enabled)
 
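
toggle_debug returns a gr.update() that flips the visibility of the debug output; the .change() binding that invokes it sits between this hunk and the next (its tail, outputs=[debug_output], is visible below). A self-contained sketch of the pattern, with component labels assumed:

import gradio as gr

with gr.Blocks() as demo:
    debug_checkbox = gr.Checkbox(label="Debug", value=False)     # label assumed
    debug_output = gr.Textbox(label="Debug Log", visible=False)  # label assumed

    def toggle_debug(debug_enabled):
        return gr.update(visible=debug_enabled)

    debug_checkbox.change(
        toggle_debug,
        inputs=[debug_checkbox],
        outputs=[debug_output],
    )
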
@@ -344,7 +353,6 @@ with gr.Blocks(title="WhisperX CPU Transcription") as demo:
         outputs=[debug_output]
     )
 
-    # Process transcription with all new parameters
     transcribe_btn.click(
         transcribe,
         inputs=[audio_input, model_selector, debug_checkbox, pause_threshold_slider, vocal_extraction_checkbox, language_input],