import gradio as gr
import whisper
import torch
import os
import numpy as np
from pydub import AudioSegment, silence
from faster_whisper import WhisperModel  # Import faster-whisper
import noisereduce as nr  # Import noisereduce for background noise removal
from spleeter.separator import Separator  # Import Spleeter for music separation
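# Environment note (assumption about the intended setup): this script expects the
# gradio, openai-whisper, faster-whisper, torch, pydub, noisereduce and spleeter
# packages to be installed. pydub additionally needs an ffmpeg binary on the system
# PATH to decode non-WAV uploads, and Spleeter pulls in TensorFlow as a dependency.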
# Mapping of model names to Whisper model sizes
MODELS = {
    "Tiny (Fastest)": "tiny",
    "Base (Faster)": "base",
    "Small (Balanced)": "small",
    "Medium (Accurate)": "medium",
    "Large (Most Accurate)": "large",
    "Faster Whisper Large v3": "Systran/faster-whisper-large-v3",  # Default model
}
# Mapping of full language names to language codes
LANGUAGE_NAME_TO_CODE = {
    "Auto Detect": "Auto Detect",
    "English": "en",
    "Chinese": "zh",
    "German": "de",
    "Spanish": "es",
    "Russian": "ru",
    "Korean": "ko",
    "French": "fr",
    "Japanese": "ja",
    "Portuguese": "pt",
    "Turkish": "tr",
    "Polish": "pl",
    "Catalan": "ca",
    "Dutch": "nl",
    "Arabic": "ar",
    "Swedish": "sv",
    "Italian": "it",
    "Indonesian": "id",
    "Hindi": "hi",
    "Finnish": "fi",
    "Vietnamese": "vi",
    "Hebrew": "he",
    "Ukrainian": "uk",
    "Greek": "el",
    "Malay": "ms",
    "Czech": "cs",
    "Romanian": "ro",
    "Danish": "da",
    "Hungarian": "hu",
    "Tamil": "ta",
    "Norwegian": "no",
    "Thai": "th",
    "Urdu": "ur",
    "Croatian": "hr",
    "Bulgarian": "bg",
    "Lithuanian": "lt",
    "Latin": "la",
    "Maori": "mi",
    "Malayalam": "ml",
    "Welsh": "cy",
    "Slovak": "sk",
    "Telugu": "te",
    "Persian": "fa",
    "Latvian": "lv",
    "Bengali": "bn",
    "Serbian": "sr",
    "Azerbaijani": "az",
    "Slovenian": "sl",
    "Kannada": "kn",
    "Estonian": "et",
    "Macedonian": "mk",
    "Breton": "br",
    "Basque": "eu",
    "Icelandic": "is",
    "Armenian": "hy",
    "Nepali": "ne",
    "Mongolian": "mn",
    "Bosnian": "bs",
    "Kazakh": "kk",
    "Albanian": "sq",
    "Swahili": "sw",
    "Galician": "gl",
    "Marathi": "mr",
    "Punjabi": "pa",
    "Sinhala": "si",  # Sinhala support
    "Khmer": "km",
    "Shona": "sn",
    "Yoruba": "yo",
    "Somali": "so",
    "Afrikaans": "af",
    "Occitan": "oc",
    "Georgian": "ka",
    "Belarusian": "be",
    "Tajik": "tg",
    "Sindhi": "sd",
    "Gujarati": "gu",
    "Amharic": "am",
    "Yiddish": "yi",
    "Lao": "lo",
    "Uzbek": "uz",
    "Faroese": "fo",
    "Haitian Creole": "ht",
    "Pashto": "ps",
    "Turkmen": "tk",
    "Nynorsk": "nn",
    "Maltese": "mt",
    "Sanskrit": "sa",
    "Luxembourgish": "lb",
    "Burmese": "my",
    "Tibetan": "bo",
    "Tagalog": "tl",
    "Malagasy": "mg",
    "Assamese": "as",
    "Tatar": "tt",
    "Hawaiian": "haw",
    "Lingala": "ln",
    "Hausa": "ha",
    "Bashkir": "ba",
    "Javanese": "jw",
    "Sundanese": "su",
}
# Reverse mapping of language codes to full language names
CODE_TO_LANGUAGE_NAME = {v: k for k, v in LANGUAGE_NAME_TO_CODE.items()}
def detect_language(audio_file):
    """Detect the language of the audio file."""
    # Define device and compute type for faster-whisper
    device = "cuda" if torch.cuda.is_available() else "cpu"
    compute_type = "float32" if device == "cuda" else "int8"
    # Load the faster-whisper model for language detection
    model = WhisperModel(MODELS["Faster Whisper Large v3"], device=device, compute_type=compute_type)
    # Convert audio to 16kHz mono for better compatibility
    audio = AudioSegment.from_file(audio_file)
    audio = audio.set_frame_rate(16000).set_channels(1)
    processed_audio_path = "processed_audio.wav"
    audio.export(processed_audio_path, format="wav")
    # Detect the language using faster-whisper (language=None enables auto-detection)
    segments, info = model.transcribe(processed_audio_path, task="translate", language=None)
    detected_language_code = info.language
    # Get the full language name from the code
    detected_language = CODE_TO_LANGUAGE_NAME.get(detected_language_code, "Unknown Language")
    # Clean up the processed audio file
    os.remove(processed_audio_path)
    return f"Detected Language: {detected_language}"
def remove_silence(audio_file, silence_threshold=-40, min_silence_len=500):
    """
    Remove silence from the audio file using pydub's amplitude-threshold silence detection.

    Args:
        audio_file (str): Path to the input audio file.
        silence_threshold (int): Silence threshold in dBFS. Default is -40 dB.
        min_silence_len (int): Minimum length of silence to remove, in milliseconds. Default is 500 ms.

    Returns:
        str: Path to the output audio file with silence removed.
    """
    # Load the audio file
    audio = AudioSegment.from_file(audio_file)
    # Detect silent chunks
    silent_chunks = silence.detect_silence(
        audio,
        min_silence_len=min_silence_len,
        silence_thresh=silence_threshold
    )
    # Remove silent chunks by concatenating the non-silent parts
    non_silent_audio = AudioSegment.empty()
    start = 0
    for chunk in silent_chunks:
        non_silent_audio += audio[start:chunk[0]]  # Add non-silent part
        start = chunk[1]  # Move to the end of the silent chunk
    non_silent_audio += audio[start:]  # Add the remaining part
    # Export the processed audio
    output_path = "silence_removed_audio.wav"
    non_silent_audio.export(output_path, format="wav")
    return output_path
def remove_background_noise(audio_file, noise_reduce_level=0.5):
    """
    Remove background noise from the audio file using spectral gating (noisereduce).

    Args:
        audio_file (str): Path to the input audio file.
        noise_reduce_level (float): Noise reduction level (0.0 to 1.0). Default is 0.5.

    Returns:
        str: Path to the output audio file with background noise removed.
    """
    # Load the audio file and convert to mono so the sample array is one-dimensional
    audio = AudioSegment.from_file(audio_file).set_channels(1)
    # Convert audio to a numpy array for noisereduce
    samples = np.array(audio.get_array_of_samples())
    sample_rate = audio.frame_rate
    # Perform noise reduction
    reduced_noise = nr.reduce_noise(
        y=samples,
        sr=sample_rate,
        prop_decrease=noise_reduce_level
    )
    # noisereduce returns floating-point samples; cast back to the original integer
    # dtype so the byte layout matches the AudioSegment sample width
    reduced_noise = reduced_noise.astype(samples.dtype)
    # Convert back to AudioSegment
    reduced_audio = AudioSegment(
        reduced_noise.tobytes(),
        frame_rate=sample_rate,
        sample_width=audio.sample_width,
        channels=1
    )
    # Export the processed audio
    output_path = "noise_reduced_audio.wav"
    reduced_audio.export(output_path, format="wav")
    return output_path
def remove_background_music(audio_file):
    """
    Remove background music from the audio file using Spleeter.

    Args:
        audio_file (str): Path to the input audio file.

    Returns:
        str: Path to the separated vocals track (background music removed).
    """
    # Initialize the Spleeter separator (2 stems: vocals and accompaniment)
    separator = Separator('spleeter:2stems')
    # Separate vocals from background music; stems are written to output/<basename>/
    separator.separate_to_file(audio_file, "output")
    # Build the path to the separated vocals (splitext handles non-WAV inputs such as .mp3)
    base_name = os.path.splitext(os.path.basename(audio_file))[0]
    output_path = os.path.join("output", base_name, "vocals.wav")
    return output_path
def transcribe_audio(audio_file, language="Auto Detect", model_size="Faster Whisper Large v3"):
    """Transcribe the audio file."""
    # Convert audio to 16kHz mono for better compatibility
    audio = AudioSegment.from_file(audio_file)
    audio = audio.set_frame_rate(16000).set_channels(1)
    processed_audio_path = "processed_audio.wav"
    audio.export(processed_audio_path, format="wav")
    # Load the appropriate model
    if model_size == "Faster Whisper Large v3":
        # Define device and compute type for faster-whisper
        device = "cuda" if torch.cuda.is_available() else "cpu"
        compute_type = "float32" if device == "cuda" else "int8"
        # Use faster-whisper for the Systran model
        model = WhisperModel(MODELS[model_size], device=device, compute_type=compute_type)
        # Honour the selected language; None lets faster-whisper auto-detect
        language_code = None if language == "Auto Detect" else LANGUAGE_NAME_TO_CODE.get(language)
        segments, info = model.transcribe(
            processed_audio_path,
            task="transcribe",
            language=language_code,
            word_timestamps=True,
            repetition_penalty=1.1,
            temperature=[0.0, 0.1, 0.2, 0.3, 0.4, 0.6, 0.8, 1.0],
        )
        transcription = " ".join(segment.text.strip() for segment in segments)
        detected_language_code = info.language
        detected_language = CODE_TO_LANGUAGE_NAME.get(detected_language_code, "Unknown Language")
    else:
        # Use the standard Whisper model
        model = whisper.load_model(MODELS[model_size])
        # Transcribe the audio
        if language == "Auto Detect":
            result = model.transcribe(processed_audio_path, fp16=False)  # Auto-detect language
            detected_language_code = result.get("language", "unknown")
            detected_language = CODE_TO_LANGUAGE_NAME.get(detected_language_code, "Unknown Language")
        else:
            language_code = LANGUAGE_NAME_TO_CODE.get(language, "en")  # Default to English if not found
            result = model.transcribe(processed_audio_path, language=language_code, fp16=False)
            detected_language = language
        transcription = result["text"]
    # Clean up the processed audio file
    os.remove(processed_audio_path)
    # Return transcription and detected language
    return f"Detected Language: {detected_language}\n\nTranscription:\n{transcription}"
# Define the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Audio Transcription and Language Detection")

    with gr.Tab("Detect Language"):
        gr.Markdown("Upload an audio file to detect its language.")
        detect_audio_input = gr.Audio(type="filepath", label="Upload Audio File")
        detect_language_output = gr.Textbox(label="Detected Language")
        detect_button = gr.Button("Detect Language")

    with gr.Tab("Transcribe Audio"):
        gr.Markdown("Upload an audio file, select a language (or choose 'Auto Detect'), and choose a model for transcription.")
        transcribe_audio_input = gr.Audio(type="filepath", label="Upload Audio File")
        language_dropdown = gr.Dropdown(
            choices=list(LANGUAGE_NAME_TO_CODE.keys()),  # Full language names
            label="Select Language",
            value="Auto Detect"
        )
        model_dropdown = gr.Dropdown(
            choices=list(MODELS.keys()),  # Model options
            label="Select Model",
            value="Faster Whisper Large v3",  # Default model
            interactive=True
        )
        transcribe_output = gr.Textbox(label="Transcription and Detected Language")
        transcribe_button = gr.Button("Transcribe Audio")

    with gr.Tab("Remove Silence"):
        gr.Markdown("Upload an audio file to remove silence.")
        silence_audio_input = gr.Audio(type="filepath", label="Upload Audio File")
        silence_threshold_slider = gr.Slider(
            minimum=-60, maximum=-20, value=-40, step=1,
            label="Silence Threshold (dB)",
            info="Lower values detect quieter sounds as silence."
        )
        min_silence_len_slider = gr.Slider(
            minimum=100, maximum=2000, value=500, step=100,
            label="Minimum Silence Length (ms)",
            info="Minimum duration of silence to remove."
        )
        silence_output = gr.Audio(label="Processed Audio (Silence Removed)", type="filepath")
        silence_button = gr.Button("Remove Silence")

    with gr.Tab("Remove Background Noise"):
        gr.Markdown("Upload an audio file to remove background noise.")
        noise_audio_input = gr.Audio(type="filepath", label="Upload Audio File")
        noise_reduce_slider = gr.Slider(
            minimum=0.0, maximum=1.0, value=0.5, step=0.1,
            label="Noise Reduction Level",
            info="Higher values remove more noise."
        )
        noise_output = gr.Audio(label="Processed Audio (Noise Removed)", type="filepath")
        noise_button = gr.Button("Remove Background Noise")

    with gr.Tab("Remove Background Music"):
        gr.Markdown("Upload an audio file to remove background music.")
        music_audio_input = gr.Audio(type="filepath", label="Upload Audio File")
        music_output = gr.Audio(label="Processed Audio (Music Removed)", type="filepath")
        music_button = gr.Button("Remove Background Music")

    # Link buttons to functions
    detect_button.click(detect_language, inputs=detect_audio_input, outputs=detect_language_output)
    transcribe_button.click(
        transcribe_audio,
        inputs=[transcribe_audio_input, language_dropdown, model_dropdown],
        outputs=transcribe_output
    )
    silence_button.click(
        remove_silence,
        inputs=[silence_audio_input, silence_threshold_slider, min_silence_len_slider],
        outputs=silence_output
    )
    noise_button.click(
        remove_background_noise,
        inputs=[noise_audio_input, noise_reduce_slider],
        outputs=noise_output
    )
    music_button.click(
        remove_background_music,
        inputs=music_audio_input,
        outputs=music_output
    )

# Launch the Gradio interface
demo.launch()
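# Usage note (assumption about deployment): on Hugging Face Spaces, demo.launch() needs no
# arguments. When running this script locally, demo.launch(share=True) can be used instead to
# expose a temporary public Gradio URL for testing.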