sdafd committed
Commit ed7cca2 · verified · 1 Parent(s): 7a3ea68

Update app.py

Files changed (1)
  1. app.py +28 -20
app.py CHANGED
@@ -56,7 +56,7 @@ def get_vocals(input_file):
             'data': [
                 {
                     'path': json_data[0],
-                    'url': 'https://politrees-audio-separator-uvr.hf.space/gradio_api/file='+json_data[0],
+                    'url': 'https://politrees-audio-separator-uvr.hf.space/gradio_api/file=' + json_data[0],
                     'orig_name': pathlib.Path(input_file).name,
                     'size': file_len,
                     'mime_type': 'audio/wav',
@@ -135,17 +135,30 @@ def get_vocals(input_file):
     return None
 
 # -------------------------------
-# Normalization Function
+# Advanced Normalization Function
 # -------------------------------
-def normalize_audio(audio, threshold_ratio=0.6):
+def advanced_normalize_audio(audio, threshold_ratio=0.6, window_size=1024):
     """
-    Given an audio signal (numpy array), set to 0 any samples that are below
-    a given ratio of the maximum absolute amplitude. This is a simple way to
-    suppress relatively quieter (background) parts.
+    This advanced normalization function computes a moving-average envelope of the absolute
+    audio signal using a specified window size. It then zeroes out portions of the signal
+    where the envelope falls below a threshold (defined as a ratio of the maximum envelope value).
+
+    Parameters:
+        audio (np.ndarray): Input audio signal.
+        threshold_ratio (float): Ratio (0-1) to determine the minimum envelope value to keep.
+        window_size (int): Size of the moving window used to compute the envelope.
+
+    Returns:
+        np.ndarray: The normalized audio signal.
     """
-    max_val = np.max(np.abs(audio))
-    threshold = threshold_ratio * max_val
-    normalized_audio = np.where(np.abs(audio) >= threshold, audio, 0)
+    # Compute moving-average envelope
+    envelope = np.convolve(np.abs(audio), np.ones(window_size) / window_size, mode='same')
+    max_env = np.max(envelope)
+    threshold = threshold_ratio * max_env
+    # Create a mask: keep samples where the envelope meets or exceeds the threshold.
+    mask = envelope >= threshold
+    # Optionally, you might smooth the mask further to avoid abrupt cuts.
+    normalized_audio = audio * mask.astype(audio.dtype)
     return normalized_audio
 
 # -------------------------------
@@ -207,7 +220,6 @@ def transcribe(audio_file, model_size="base", debug=False, pause_threshold=0.0,
         debug_log.append("Vocal extraction succeeded; downloading extracted audio...")
         response = requests.get(extracted_url)
         if response.status_code == 200:
-            # Write to a temporary file
             with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
                 tmp.write(response.content)
                 audio_file = tmp.name
@@ -221,26 +233,26 @@ def transcribe(audio_file, model_size="base", debug=False, pause_threshold=0.0,
     audio, sr = librosa.load(audio_file, sr=16000)
     debug_log.append(f"Audio loaded: {len(audio)/sr:.2f} seconds long at {sr} Hz")
 
-    # If we used vocal extraction, apply normalization to remove low-amplitude (background) parts
+    # If vocal extraction was used, apply advanced normalization
     if vocal_extraction:
-        audio = normalize_audio(audio)
-        debug_log.append("Normalization applied to extracted audio to remove low-amplitude segments.")
+        audio = advanced_normalize_audio(audio)
+        debug_log.append("Advanced normalization applied to extracted audio to remove low-amplitude segments.")
 
     # Select the model and set batch size
     model = models[model_size]
     batch_size = 8 if model_size == "tiny" else 4
 
-    # Use the provided language if set; otherwise, let the model detect the language.
+    # Use provided language if set; otherwise, use language detection.
     if language:
         transcript = model.transcribe(audio, batch_size=batch_size, language=language)
     else:
         transcript = model.transcribe(audio, batch_size=batch_size)
         language = transcript.get("language", "unknown")
 
-    # Load alignment model using the specified/overridden language
+    # Load alignment model using the specified language
     model_a, metadata = whisperx.load_align_model(language_code=language, device=device)
 
-    # If pause_threshold > 0, split the audio and process segments individually
+    # If pause_threshold > 0, split audio and process segments individually
     if pause_threshold > 0:
         segments = split_audio_by_pause(audio, sr, pause_threshold)
         debug_log.append(f"Audio split into {len(segments)} segment(s) using a pause threshold of {pause_threshold}s")
@@ -307,12 +319,10 @@ with gr.Blocks(title="WhisperX CPU Transcription") as demo:
         interactive=True,
         info="Set a pause duration threshold. Audio pauses longer than this will be used to split the audio into segments."
     )
-    # New input for vocal extraction feature
     vocal_extraction_checkbox = gr.Checkbox(
         label="Extract Vocals (improves accuracy on noisy audio)",
         value=False
    )
-    # New language selection (default English)
    language_input = gr.Textbox(
        label="Language Code (e.g., en, es, fr)",
        placeholder="Enter language code",
@@ -334,7 +344,6 @@ with gr.Blocks(title="WhisperX CPU Transcription") as demo:
         visible=False,
     )
 
-    # Toggle debug visibility
     def toggle_debug(debug_enabled):
         return gr.update(visible=debug_enabled)
 
@@ -344,7 +353,6 @@ with gr.Blocks(title="WhisperX CPU Transcription") as demo:
         outputs=[debug_output]
     )
 
-    # Process transcription with all new parameters
    transcribe_btn.click(
        transcribe,
        inputs=[audio_input, model_selector, debug_checkbox, pause_threshold_slider, vocal_extraction_checkbox, language_input],
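Note: the substantive change in this commit swaps per-sample thresholding (np.where against the global peak) for envelope-based gating: a moving average of |audio| decides which regions survive, so zero crossings inside loud speech are no longer dropped and quiet stretches are zeroed as whole regions. A minimal, runnable sketch of the gating behavior (the function body mirrors the committed code; the synthetic test signal and printed checks are illustrative only):

    import numpy as np

    def advanced_normalize_audio(audio, threshold_ratio=0.6, window_size=1024):
        # Moving-average envelope of the absolute signal, as in the committed code.
        envelope = np.convolve(np.abs(audio), np.ones(window_size) / window_size, mode='same')
        threshold = threshold_ratio * np.max(envelope)
        mask = envelope >= threshold
        return audio * mask.astype(audio.dtype)

    # Illustrative input: 1 s of quiet noise, 1 s of a loud 220 Hz tone, 1 s of quiet noise.
    sr = 16000
    t = np.linspace(0, 3, 3 * sr, endpoint=False)
    signal = (0.05 * np.random.randn(3 * sr)).astype(np.float32)
    signal[sr:2 * sr] += (0.8 * np.sin(2 * np.pi * 220 * t[sr:2 * sr])).astype(np.float32)

    gated = advanced_normalize_audio(signal)
    # The quiet head is zeroed because its envelope sits below the threshold.
    print("quiet head zeroed:", np.allclose(gated[:sr // 2], 0))  # True
    # The loud middle passes through untouched, including its zero crossings.
    print("loud middle intact:", np.array_equal(gated[sr + 2048:2 * sr - 2048],
                                                signal[sr + 2048:2 * sr - 2048]))  # True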
 
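The new code itself comments that the hard 0/1 mask could be smoothed further to avoid abrupt cuts. One possible follow-up, not part of this commit (the helper name and fade length are illustrative), is to average the mask over a short window so each gate edge becomes a linear fade rather than a hard switch:

    import numpy as np

    def smooth_mask(mask, fade_len=512):
        # Averaging the boolean mask over fade_len samples turns each hard
        # 0->1 or 1->0 edge into a linear ramp of roughly fade_len samples
        # (about 32 ms at 16 kHz), which removes audible clicks at the cuts.
        window = np.ones(fade_len) / fade_len
        return np.convolve(mask.astype(np.float64), window, mode='same')

    # Drop-in change to the last assignment in advanced_normalize_audio:
    # normalized_audio = (audio * smooth_mask(mask)).astype(audio.dtype)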