sdafd committed
Commit ed7cca2 · verified · 1 Parent(s): 7a3ea68

Update app.py

Files changed (1)
  1. app.py +28 -20
app.py CHANGED
@@ -56,7 +56,7 @@ def get_vocals(input_file):
             'data': [
                 {
                     'path': json_data[0],
-                    'url': 'https://politrees-audio-separator-uvr.hf.space/gradio_api/file='+json_data[0],
+                    'url': 'https://politrees-audio-separator-uvr.hf.space/gradio_api/file=' + json_data[0],
                     'orig_name': pathlib.Path(input_file).name,
                     'size': file_len,
                     'mime_type': 'audio/wav',
@@ -135,17 +135,30 @@ def get_vocals(input_file):
     return None
 
 # -------------------------------
-# Normalization Function
+# Advanced Normalization Function
 # -------------------------------
-def normalize_audio(audio, threshold_ratio=0.6):
+def advanced_normalize_audio(audio, threshold_ratio=0.6, window_size=1024):
     """
-    Given an audio signal (numpy array), set to 0 any samples that are below
-    a given ratio of the maximum absolute amplitude. This is a simple way to
-    suppress relatively quieter (background) parts.
+    This advanced normalization function computes a moving-average envelope of the absolute
+    audio signal using a specified window size. It then zeroes out portions of the signal
+    where the envelope falls below a threshold (defined as a ratio of the maximum envelope value).
+
+    Parameters:
+        audio (np.ndarray): Input audio signal.
+        threshold_ratio (float): Ratio (0-1) to determine the minimum envelope value to keep.
+        window_size (int): Size of the moving window used to compute the envelope.
+
+    Returns:
+        np.ndarray: The normalized audio signal.
     """
-    max_val = np.max(np.abs(audio))
-    threshold = threshold_ratio * max_val
-    normalized_audio = np.where(np.abs(audio) >= threshold, audio, 0)
+    # Compute moving-average envelope
+    envelope = np.convolve(np.abs(audio), np.ones(window_size) / window_size, mode='same')
+    max_env = np.max(envelope)
+    threshold = threshold_ratio * max_env
+    # Create a mask: keep samples where the envelope meets or exceeds the threshold.
+    mask = envelope >= threshold
+    # Optionally, you might smooth the mask further to avoid abrupt cuts.
+    normalized_audio = audio * mask.astype(audio.dtype)
     return normalized_audio
 
 # -------------------------------
@@ -207,7 +220,6 @@ def transcribe(audio_file, model_size="base", debug=False, pause_threshold=0.0,
         debug_log.append("Vocal extraction succeeded; downloading extracted audio...")
         response = requests.get(extracted_url)
         if response.status_code == 200:
-            # Write to a temporary file
             with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
                 tmp.write(response.content)
                 audio_file = tmp.name
@@ -221,26 +233,26 @@ def transcribe(audio_file, model_size="base", debug=False, pause_threshold=0.0,
     audio, sr = librosa.load(audio_file, sr=16000)
     debug_log.append(f"Audio loaded: {len(audio)/sr:.2f} seconds long at {sr} Hz")
 
-    # If we used vocal extraction, apply normalization to remove low-amplitude (background) parts
+    # If vocal extraction was used, apply advanced normalization
     if vocal_extraction:
-        audio = normalize_audio(audio)
-        debug_log.append("Normalization applied to extracted audio to remove low-amplitude segments.")
+        audio = advanced_normalize_audio(audio)
+        debug_log.append("Advanced normalization applied to extracted audio to remove low-amplitude segments.")
 
     # Select the model and set batch size
     model = models[model_size]
     batch_size = 8 if model_size == "tiny" else 4
 
-    # Use the provided language if set; otherwise, let the model detect the language.
+    # Use provided language if set; otherwise, use language detection.
     if language:
         transcript = model.transcribe(audio, batch_size=batch_size, language=language)
     else:
         transcript = model.transcribe(audio, batch_size=batch_size)
         language = transcript.get("language", "unknown")
 
-    # Load alignment model using the specified/overridden language
+    # Load alignment model using the specified language
     model_a, metadata = whisperx.load_align_model(language_code=language, device=device)
 
-    # If pause_threshold > 0, split the audio and process segments individually
+    # If pause_threshold > 0, split audio and process segments individually
     if pause_threshold > 0:
         segments = split_audio_by_pause(audio, sr, pause_threshold)
         debug_log.append(f"Audio split into {len(segments)} segment(s) using a pause threshold of {pause_threshold}s")
@@ -307,12 +319,10 @@ with gr.Blocks(title="WhisperX CPU Transcription") as demo:
         interactive=True,
         info="Set a pause duration threshold. Audio pauses longer than this will be used to split the audio into segments."
     )
-    # New input for vocal extraction feature
     vocal_extraction_checkbox = gr.Checkbox(
         label="Extract Vocals (improves accuracy on noisy audio)",
         value=False
    )
-    # New language selection (default English)
    language_input = gr.Textbox(
        label="Language Code (e.g., en, es, fr)",
        placeholder="Enter language code",
@@ -334,7 +344,6 @@ with gr.Blocks(title="WhisperX CPU Transcription") as demo:
         visible=False,
     )
 
-    # Toggle debug visibility
     def toggle_debug(debug_enabled):
         return gr.update(visible=debug_enabled)
 
@@ -344,7 +353,6 @@ with gr.Blocks(title="WhisperX CPU Transcription") as demo:
         outputs=[debug_output]
     )
 
-    # Process transcription with all new parameters
    transcribe_btn.click(
        transcribe,
        inputs=[audio_input, model_selector, debug_checkbox, pause_threshold_slider, vocal_extraction_checkbox, language_input],
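Note: the substantive change in this commit swaps per-sample thresholding (np.where against the global peak) for envelope-based gating: a moving average of |audio| decides which regions survive, so zero crossings inside loud speech are no longer dropped and quiet stretches are zeroed as whole regions. A minimal, runnable sketch of the gating behavior (the function body mirrors the committed code; the synthetic test signal and printed checks are illustrative only):

    import numpy as np

    def advanced_normalize_audio(audio, threshold_ratio=0.6, window_size=1024):
        # Moving-average envelope of the absolute signal, as in the committed code.
        envelope = np.convolve(np.abs(audio), np.ones(window_size) / window_size, mode='same')
        threshold = threshold_ratio * np.max(envelope)
        mask = envelope >= threshold
        return audio * mask.astype(audio.dtype)

    # Illustrative input: 1 s of quiet noise, 1 s of a loud 220 Hz tone, 1 s of quiet noise.
    sr = 16000
    t = np.linspace(0, 3, 3 * sr, endpoint=False)
    signal = (0.05 * np.random.randn(3 * sr)).astype(np.float32)
    signal[sr:2 * sr] += (0.8 * np.sin(2 * np.pi * 220 * t[sr:2 * sr])).astype(np.float32)

    gated = advanced_normalize_audio(signal)
    # The quiet head is zeroed because its envelope sits below the threshold.
    print("quiet head zeroed:", np.allclose(gated[:sr // 2], 0))  # True
    # The loud middle passes through untouched, including its zero crossings.
    print("loud middle intact:", np.array_equal(gated[sr + 2048:2 * sr - 2048],
                                                signal[sr + 2048:2 * sr - 2048]))  # True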
 
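The new code itself comments that the hard 0/1 mask could be smoothed further to avoid abrupt cuts. One possible follow-up, not part of this commit (the helper name and fade length are illustrative), is to average the mask over a short window so each gate edge becomes a linear fade rather than a hard switch:

    import numpy as np

    def smooth_mask(mask, fade_len=512):
        # Averaging the boolean mask over fade_len samples turns each hard
        # 0->1 or 1->0 edge into a linear ramp of roughly fade_len samples
        # (about 32 ms at 16 kHz), which removes audible clicks at the cuts.
        window = np.ones(fade_len) / fade_len
        return np.convolve(mask.astype(np.float64), window, mode='same')

    # Drop-in change to the last assignment in advanced_normalize_audio:
    # normalized_audio = (audio * smooth_mask(mask)).astype(audio.dtype)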