Spaces:

Athspi
/

Ai-audio

Sleeping

App Files Community

Athspi committed on Jan 11

Commit

7d07125

verified ·

1 Parent(s): 5989272

Update app.py

Browse files

Files changed (1) hide show

app.py +38 -61

app.py CHANGED Viewed

@@ -1,11 +1,9 @@
 import gradio as gr
 import whisper
 import os
 from pydub import AudioSegment
-from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
-import torch
-import librosa
-import numpy as np
 # Mapping of model names to Whisper model sizes
 MODELS = {
@@ -16,8 +14,14 @@ MODELS = {
     "Large (Most Accurate)": "large"
 }
-# Fine-tuned Sinhala model
-SINHALA_MODEL = "Subhaka/whisper-small-Sinhala-Fine_Tune"
 # Mapping of full language names to language codes
 LANGUAGE_NAME_TO_CODE = {
@@ -123,68 +127,41 @@ LANGUAGE_NAME_TO_CODE = {
     "Sundanese": "su",
 }
-# Preload the fine-tuned Sinhala model and processor
-processor = AutoProcessor.from_pretrained(SINHALA_MODEL)
-sinhala_model = AutoModelForSpeechSeq2Seq.from_pretrained(SINHALA_MODEL)
-# Move model to GPU if available
-device = "cuda" if torch.cuda.is_available() else "cpu"
-sinhala_model.to(device)
 def transcribe_audio(audio_file, language="Auto Detect", model_size="Base (Faster)"):
     """Transcribe the audio file."""
-    # Load the appropriate model
-    if language == "Sinhala":
-        # Use the fine-tuned Sinhala model
-        model = sinhala_model
-        model_processor = processor
-    else:
-        # Use the selected Whisper model
-        model = whisper.load_model(MODELS[model_size])
-        model_processor = None
-    # Convert audio to 16kHz mono for better compatibility with Whisper
     audio = AudioSegment.from_file(audio_file)
     audio = audio.set_frame_rate(16000).set_channels(1)
     processed_audio_path = "processed_audio.wav"
     audio.export(processed_audio_path, format="wav")
-    # Transcribe the audio
-    if language == "Auto Detect":
-        if model_processor:
-            # Load the audio as a NumPy array
-            raw_audio, _ = librosa.load(processed_audio_path, sr=16000)
-            raw_audio = np.array(raw_audio, dtype=np.float32)
-            # Process the audio and generate transcription
-            inputs = model_processor(raw_audio, return_tensors="pt", sampling_rate=16000).input_features.to(device)
-            with torch.no_grad():
-                generated_ids = model.generate(inputs)
-            transcription = model_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-            detected_language = "si"
-        else:
-            # Use Whisper for auto-detection
-            result = model.transcribe(processed_audio_path, fp16=(device == "cuda"))
-            transcription = result["text"]
-            detected_language = result.get("language", "unknown")
     else:
-        if model_processor:
-            # Load the audio as a NumPy array
-            raw_audio, _ = librosa.load(processed_audio_path, sr=16000)
-            raw_audio = np.array(raw_audio, dtype=np.float32)
-            # Process the audio and generate transcription
-            inputs = model_processor(raw_audio, return_tensors="pt", sampling_rate=16000).input_features.to(device)
-            with torch.no_grad():
-                generated_ids = model.generate(inputs)
-            transcription = model_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-            detected_language = "si"
         else:
-            # Use Whisper for transcription with the selected language
-            language_code = LANGUAGE_NAME_TO_CODE.get(language, "en")
-            result = model.transcribe(processed_audio_path, language=language_code, fp16=(device == "cuda"))
-            transcription = result["text"]
             detected_language = language_code
     # Clean up processed audio file
     os.remove(processed_audio_path)
@@ -194,7 +171,7 @@ def transcribe_audio(audio_file, language="Auto Detect", model_size="Base (Faste
 # Define the Gradio interface
 with gr.Blocks() as demo:
-    gr.Markdown("# Audio Transcription and Language Detection")
     with gr.Tab("Transcribe Audio"):
         gr.Markdown("Upload an audio file, select a language (or choose 'Auto Detect'), and choose a model for transcription.")
@@ -215,8 +192,8 @@ with gr.Blocks() as demo:
     # Update model dropdown based on language selection
     def update_model_dropdown(language):
-        if language == "Sinhala":
-            return gr.Dropdown(interactive=False, value="Base (Faster)")  # Set a valid value
         else:
             return gr.Dropdown(choices=list(MODELS.keys()), interactive=True, value="Base (Faster)")

 import gradio as gr
 import whisper
+import torch
 import os
 from pydub import AudioSegment
+from transformers import AutoProcessor, AutoModelForCTC
 # Mapping of model names to Whisper model sizes
 MODELS = {
     "Large (Most Accurate)": "large"
 }
+# Fine-tuned Wav2Vec2 models for specific languages
+WAV2VEC2_MODELS = {
+    "Tamil": {
+        "processor": "Amrrs/wav2vec2-large-xlsr-53-tamil",
+        "model": "Amrrs/wav2vec2-large-xlsr-53-tamil"
+    },
+    # Add more Wav2Vec2 models for other languages here
+}
 # Mapping of full language names to language codes
 LANGUAGE_NAME_TO_CODE = {
     "Sundanese": "su",
 }
 def transcribe_audio(audio_file, language="Auto Detect", model_size="Base (Faster)"):
     """Transcribe the audio file."""
+    # Convert audio to 16kHz mono for better compatibility
     audio = AudioSegment.from_file(audio_file)
     audio = audio.set_frame_rate(16000).set_channels(1)
     processed_audio_path = "processed_audio.wav"
     audio.export(processed_audio_path, format="wav")
+    # Load the appropriate model
+    if language in WAV2VEC2_MODELS:
+        # Use the fine-tuned Wav2Vec2 model for the selected language
+        processor = AutoProcessor.from_pretrained(WAV2VEC2_MODELS[language]["processor"])
+        model = AutoModelForCTC.from_pretrained(WAV2VEC2_MODELS[language]["model"])
+        # Load audio and process
+        inputs = processor(AudioSegment.from_file(processed_audio_path).raw_data, sampling_rate=16000, return_tensors="pt")
+        with torch.no_grad():
+            logits = model(inputs.input_values).logits
+        predicted_ids = torch.argmax(logits, dim=-1)
+        transcription = processor.decode(predicted_ids[0])
+        detected_language = language
     else:
+        # Use the selected Whisper model
+        model = whisper.load_model(MODELS[model_size])
+        # Transcribe the audio
+        if language == "Auto Detect":
+            result = model.transcribe(processed_audio_path, fp16=False)  # Auto-detect language
+            detected_language = result.get("language", "unknown")
         else:
+            language_code = LANGUAGE_NAME_TO_CODE.get(language, "en")  # Default to English if not found
+            result = model.transcribe(processed_audio_path, language=language_code, fp16=False)
             detected_language = language_code
+        transcription = result["text"]
     # Clean up processed audio file
     os.remove(processed_audio_path)
 # Define the Gradio interface
 with gr.Blocks() as demo:
+    gr.Markdown("# Audio Transcription with Fine-Tuned Models")
     with gr.Tab("Transcribe Audio"):
         gr.Markdown("Upload an audio file, select a language (or choose 'Auto Detect'), and choose a model for transcription.")
     # Update model dropdown based on language selection
     def update_model_dropdown(language):
+        if language in WAV2VEC2_MODELS:
+            return gr.Dropdown(interactive=False, value=f"Fine-Tuned {language} Model")
         else:
             return gr.Dropdown(choices=list(MODELS.keys()), interactive=True, value="Base (Faster)")