Update app.py
app.py CHANGED
@@ -3,7 +3,16 @@ import whisper
 import torch
 import os
 from pydub import AudioSegment, silence
-from faster_whisper import WhisperModel
+from faster_whisper import WhisperModel
+import numpy as np
+from scipy.io import wavfile
+from scipy.signal import correlate
+import tempfile
+import logging
+
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 
 # Mapping of model names to Whisper model sizes
 MODELS = {
@@ -122,32 +131,48 @@ LANGUAGE_NAME_TO_CODE = {
 # Reverse mapping of language codes to full language names
 CODE_TO_LANGUAGE_NAME = {v: k for k, v in LANGUAGE_NAME_TO_CODE.items()}
 
+def convert_to_wav(audio_file):
+    """Convert any audio file to WAV format."""
+    audio = AudioSegment.from_file(audio_file)
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
+        wav_path = temp_wav.name
+    audio.export(wav_path, format="wav")
+    return wav_path
+
+def resample_audio(audio_segment, target_sample_rate):
+    """Resample an audio segment to the target sample rate."""
+    return audio_segment.set_frame_rate(target_sample_rate)
+
 def detect_language(audio_file):
     """Detect the language of the audio file."""
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    compute_type = "float32" if device == "cuda" else "int8"
-
-    # Load the faster-whisper model for language detection
-    model = WhisperModel(MODELS["Faster Whisper Large v3"], device=device, compute_type=compute_type)
-
-    # Convert audio to 16kHz mono for better compatibility
-    audio = AudioSegment.from_file(audio_file)
-    audio = audio.set_frame_rate(16000).set_channels(1)
-    processed_audio_path = "processed_audio.wav"
-    audio.export(processed_audio_path, format="wav")
-
-    # Detect the language using faster-whisper
-    segments, info = model.transcribe(processed_audio_path, task="translate", language=None)
-    detected_language_code = info.language
+    if audio_file is None:
+        return "Error: No audio file uploaded."
 
-    # Get the full language name from the code
-    detected_language = CODE_TO_LANGUAGE_NAME.get(detected_language_code, "Unknown Language")
-
-    return f"Detected Language: {detected_language}"
+    try:
+        # Convert audio to WAV format
+        wav_path = convert_to_wav(audio_file)
+
+        # Define device and compute type for faster-whisper
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        compute_type = "float32" if device == "cuda" else "int8"
+
+        # Load the faster-whisper model for language detection
+        model = WhisperModel(MODELS["Faster Whisper Large v3"], device=device, compute_type=compute_type)
+
+        # Detect the language using faster-whisper
+        segments, info = model.transcribe(wav_path, task="translate", language=None)
+        detected_language_code = info.language
+
+        # Get the full language name from the code
+        detected_language = CODE_TO_LANGUAGE_NAME.get(detected_language_code, "Unknown Language")
+
+        # Clean up temporary WAV file
+        os.remove(wav_path)
+
+        return f"Detected Language: {detected_language}"
+    except Exception as e:
+        logger.error(f"Error in detect_language: {str(e)}")
+        return f"Error: {str(e)}"
 
 def remove_silence(audio_file, silence_threshold=-40, min_silence_len=500):
     """
@@ -161,81 +186,188 @@ def remove_silence(audio_file, silence_threshold=-40, min_silence_len=500):
     Returns:
         str: Path to the output audio file with silence removed.
     """
+    if audio_file is None:
+        return None
 
+    try:
+        # Convert audio to WAV format
+        wav_path = convert_to_wav(audio_file)
+
+        # Load the audio file
+        audio = AudioSegment.from_file(wav_path)
+
+        # Detect silent chunks
+        silent_chunks = silence.detect_silence(
+            audio,
+            min_silence_len=min_silence_len,
+            silence_thresh=silence_threshold
+        )
+
+        # Remove silent chunks
+        non_silent_audio = AudioSegment.empty()
+        start = 0
+        for chunk in silent_chunks:
+            non_silent_audio += audio[start:chunk[0]]  # Add non-silent part
+            start = chunk[1]  # Move to the end of the silent chunk
+        non_silent_audio += audio[start:]  # Add the remaining part
+
+        # Export the processed audio
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_output:
+            output_path = temp_output.name
+        non_silent_audio.export(output_path, format="wav")
+
+        # Clean up temporary WAV file
+        os.remove(wav_path)
+
+        return output_path
+    except Exception as e:
+        logger.error(f"Error in remove_silence: {str(e)}")
+        return f"Error: {str(e)}"
+
+def detect_and_trim_audio(main_audio, target_audio, threshold=0.5):
+    """
+    Detect the target audio in the main audio and trim the main audio to include only the detected segments.
 
+    Args:
+        main_audio (str): Path to the main audio file.
+        target_audio (str): Path to the target audio file.
+        threshold (float): Detection threshold (0 to 1). Higher values mean stricter detection.
 
+    Returns:
+        str: Path to the trimmed audio file.
+        str: Detected timestamps in the format "start-end (in seconds)".
+    """
+    if main_audio is None or target_audio is None:
+        return None, "Error: Please upload both main and target audio files."
 
+    try:
+        # Convert audio files to WAV format
+        main_wav_path = convert_to_wav(main_audio)
+        target_wav_path = convert_to_wav(target_audio)
+
+        # Load audio files
+        main_rate, main_data = wavfile.read(main_wav_path)
+        target_rate, target_data = wavfile.read(target_wav_path)
+
+        # Ensure both audio files have the same sample rate
+        if main_rate != target_rate:
+            logger.warning(f"Sample rates differ: main_audio={main_rate}, target_audio={target_rate}. Resampling target audio.")
+            target_segment = AudioSegment.from_file(target_wav_path)
+            target_segment = resample_audio(target_segment, main_rate)
+            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_resampled:
+                resampled_path = temp_resampled.name
+            target_segment.export(resampled_path, format="wav")
+            target_rate, target_data = wavfile.read(resampled_path)
+
+        # Normalize audio data
+        main_data = main_data.astype(np.float32) / np.iinfo(main_data.dtype).max
+        target_data = target_data.astype(np.float32) / np.iinfo(target_data.dtype).max
+
+        # Perform cross-correlation to detect the target audio in the main audio
+        correlation = correlate(main_data, target_data, mode='valid')
+        correlation = np.abs(correlation)
+        max_corr = np.max(correlation)
+
+        # Find the peak in the cross-correlation result
+        peak_index = np.argmax(correlation)
+        peak_value = correlation[peak_index]
+
+        # Check if the peak value exceeds the threshold
+        if peak_value < threshold * max_corr:
+            return None, "Error: Target audio not detected in the main audio."
+
+        # Calculate the start and end times of the target audio in the main audio
+        start_time = peak_index / main_rate
+        end_time = (peak_index + len(target_data)) / main_rate
+
+        # Trim the main audio to include only the detected segment
+        main_audio_segment = AudioSegment.from_file(main_wav_path)
+        start_ms = int(start_time * 1000)
+        end_ms = int(end_time * 1000)
+        trimmed_audio = main_audio_segment[start_ms:end_ms]
+
+        # Export the trimmed audio
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_output:
+            output_path = temp_output.name
+        trimmed_audio.export(output_path, format="wav")
+
+        # Format timestamps
+        timestamps_str = f"{start_time:.2f}-{end_time:.2f}"
+
+        # Clean up temporary WAV files
+        os.remove(main_wav_path)
+        os.remove(target_wav_path)
+        if 'resampled_path' in locals():
+            os.remove(resampled_path)
+
+        return output_path, timestamps_str
+    except Exception as e:
+        logger.error(f"Error in detect_and_trim_audio: {str(e)}")
+        return None, f"Error: {str(e)}"
 
 def transcribe_audio(audio_file, language="Auto Detect", model_size="Faster Whisper Large v3"):
     """Transcribe the audio file."""
+    if audio_file is None:
+        return "Error: No audio file uploaded."
 
+    try:
+        # Convert audio to WAV format
+        wav_path = convert_to_wav(audio_file)
 
+        # Convert audio to 16kHz mono for better compatibility
+        audio = AudioSegment.from_file(wav_path)
+        audio = audio.set_frame_rate(16000).set_channels(1)
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_processed:
+            processed_audio_path = temp_processed.name
+        audio.export(processed_audio_path, format="wav")
 
+        # Load the appropriate model
+        if model_size == "Faster Whisper Large v3":
+            # Define device and compute type for faster-whisper
+            device = "cuda" if torch.cuda.is_available() else "cpu"
+            compute_type = "float32" if device == "cuda" else "int8"
+
+            # Use faster-whisper for the Systran model
+            model = WhisperModel(MODELS[model_size], device=device, compute_type=compute_type)
+            segments, info = model.transcribe(
+                processed_audio_path,
+                task="transcribe",
+                word_timestamps=True,
+                repetition_penalty=1.1,
+                temperature=[0.0, 0.1, 0.2, 0.3, 0.4, 0.6, 0.8, 1.0],
+            )
+            transcription = " ".join([segment.text for segment in segments])
+            detected_language_code = info.language
             detected_language = CODE_TO_LANGUAGE_NAME.get(detected_language_code, "Unknown Language")
         else:
+            # Use the standard Whisper model
+            model = whisper.load_model(MODELS[model_size])
+
+            # Transcribe the audio
+            if language == "Auto Detect":
+                result = model.transcribe(processed_audio_path, fp16=False)  # Auto-detect language
+                detected_language_code = result.get("language", "unknown")
+                detected_language = CODE_TO_LANGUAGE_NAME.get(detected_language_code, "Unknown Language")
+            else:
+                language_code = LANGUAGE_NAME_TO_CODE.get(language, "en")  # Default to English if not found
+                result = model.transcribe(processed_audio_path, language=language_code, fp16=False)
+                detected_language = language
+
+            transcription = result["text"]
 
+        # Clean up processed audio file
+        os.remove(processed_audio_path)
+        os.remove(wav_path)
+
+        # Return transcription and detected language
+        return f"Detected Language: {detected_language}\n\nTranscription:\n{transcription}"
+    except Exception as e:
+        logger.error(f"Error in transcribe_audio: {str(e)}")
+        return f"Error: {str(e)}"
 
 # Define the Gradio interface
 with gr.Blocks() as demo:
+    gr.Markdown("# Audio Processing Tool")
 
     with gr.Tab("Detect Language"):
         gr.Markdown("Upload an audio file to detect its language.")
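The cross-correlation step in detect_and_trim_audio deserves a note: `correlate(main_data, target_data, mode='valid')` is largest at the sample offset where the target clip best lines up with the main recording, so dividing `np.argmax` of the result by the sample rate converts that offset into seconds. A minimal sketch with synthetic data (not part of the commit; it assumes mono samples, which matters because `convert_to_wav` does not force one channel, so a stereo file would reach this code as a 2-D array):

    import numpy as np
    from scipy.signal import correlate

    rate = 16000                                          # assume 16 kHz mono audio
    main = np.random.randn(rate * 10).astype(np.float32)  # 10 s "main" recording
    target = main[3 * rate : 4 * rate]                    # 1 s clip cut from t = 3 s
    corr = np.abs(correlate(main, target, mode="valid"))
    offset = int(np.argmax(corr))                         # best-aligning sample offset
    print(f"detected at {offset / rate:.2f} s")           # prints ~3.00 s

One caveat: because `peak_value` is read at `np.argmax(correlation)`, it always equals `max_corr`, so the guard `peak_value < threshold * max_corr` can never fire for any threshold up to 1.0. Normalizing the peak (for example, by the energies of the two signals) would give the threshold slider real effect.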
@@ -276,6 +408,19 @@ with gr.Blocks() as demo:
         silence_output = gr.Audio(label="Processed Audio (Silence Removed)", type="filepath")
         silence_button = gr.Button("Remove Silence")
 
+    with gr.Tab("Detect and Trim Audio"):
+        gr.Markdown("Upload a main audio file and a target audio file. The app will detect the target audio in the main audio and trim it.")
+        main_audio_input = gr.Audio(type="filepath", label="Upload Main Audio File")
+        target_audio_input = gr.Audio(type="filepath", label="Upload Target Audio File")
+        threshold_slider = gr.Slider(
+            minimum=0.1, maximum=1.0, value=0.5, step=0.1,
+            label="Detection Threshold",
+            info="Higher values mean stricter detection."
+        )
+        trimmed_audio_output = gr.Audio(label="Trimmed Audio", type="filepath")
+        timestamps_output = gr.Textbox(label="Detected Timestamps (in seconds)")
+        detect_trim_button = gr.Button("Detect and Trim")
+
     # Link buttons to functions
     detect_button.click(detect_language, inputs=detect_audio_input, outputs=detect_language_output)
     transcribe_button.click(
@@ -288,6 +433,11 @@ with gr.Blocks() as demo:
         inputs=[silence_audio_input, silence_threshold_slider, min_silence_len_slider],
         outputs=silence_output
     )
+    detect_trim_button.click(
+        detect_and_trim_audio,
+        inputs=[main_audio_input, target_audio_input, threshold_slider],
+        outputs=[trimmed_audio_output, timestamps_output]
+    )
 
 # Launch the Gradio interface
 demo.launch()
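For reference, the new tab simply wires detect_and_trim_audio to the two file inputs and the threshold slider, so the function can also be exercised outside Gradio. A hypothetical call (the file names are placeholders, not from the commit):

    # Find a short jingle inside a longer recording and cut it out.
    trimmed_path, timestamps = detect_and_trim_audio("podcast.wav", "jingle.wav", threshold=0.5)
    print(timestamps)   # e.g. "12.30-13.30" (start-end, in seconds)

Note also that the handlers above create their temporary WAV files with delete=False and remove them only on the success path; wrapping the cleanup in try/finally would reclaim them when an exception is raised as well.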