Spaces:

Athspi
/

Ai-audio

Running

App Files Files Community

Athspi committed on Jan 12

Commit

c9c7876

verified ·

1 Parent(s): 8b5c488

Update app.py

Browse files

Files changed (1) hide show

app.py +60 -47

app.py CHANGED Viewed

@@ -1,9 +1,10 @@
 import gradio as gr
 import whisper
-import torch
 import os
 from pydub import AudioSegment
-from huggingsound import SpeechRecognitionModel
 # Mapping of model names to Whisper model sizes
 MODELS = {
@@ -14,12 +15,8 @@ MODELS = {
     "Large (Most Accurate)": "large"
 }
-# HuggingSound model for Arabic
-HUGGINGSOUND_MODEL = {
-    "Arabic": {
-        "model": "jonatasgrosman/wav2vec2-large-xlsr-53-arabic"
-    }
-}
 # Mapping of full language names to language codes
 LANGUAGE_NAME_TO_CODE = {
@@ -87,7 +84,7 @@ LANGUAGE_NAME_TO_CODE = {
     "Galician": "gl",
     "Marathi": "mr",
     "Punjabi": "pa",
-    "Sinhala": "si",  # Sinhala support
     "Khmer": "km",
     "Shona": "sn",
     "Yoruba": "yo",
@@ -125,76 +122,92 @@ LANGUAGE_NAME_TO_CODE = {
     "Sundanese": "su",
 }
-def transcribe_audio(audio_file, language="Auto Detect", model_size="Base (Faster)"):
-    """Transcribe the audio file."""
-    # Convert audio to 16kHz mono for better compatibility
     audio = AudioSegment.from_file(audio_file)
     audio = audio.set_frame_rate(16000).set_channels(1)
     processed_audio_path = "processed_audio.wav"
     audio.export(processed_audio_path, format="wav")
-    # Load the appropriate model
-    if language in HUGGINGSOUND_MODEL:
-        # Use the HuggingSound model for the selected language
-        model = SpeechRecognitionModel(HUGGINGSOUND_MODEL[language]["model"])
-        transcriptions = model.transcribe([processed_audio_path])
-        transcription = transcriptions[0]["transcription"]
-        detected_language = language
     else:
-        # Use the selected Whisper model
-        model = whisper.load_model(MODELS[model_size])
-        # Transcribe the audio
-        if language == "Auto Detect":
-            result = model.transcribe(processed_audio_path, fp16=False)  # Auto-detect language
-            detected_language = result.get("language", "unknown")
-        else:
-            language_code = LANGUAGE_NAME_TO_CODE.get(language, "en")  # Default to English if not found
-            result = model.transcribe(processed_audio_path, language=language_code, fp16=False)
-            detected_language = language_code
-        transcription = result["text"]
     # Clean up processed audio file
     os.remove(processed_audio_path)
     # Return transcription and detected language
-    return f"Detected Language: {detected_language}\n\nTranscription:\n{transcription}"
 # Define the Gradio interface
 with gr.Blocks() as demo:
-    gr.Markdown("# Audio Transcription with HuggingSound and Whisper")
     with gr.Tab("Transcribe Audio"):
         gr.Markdown("Upload an audio file, select a language (or choose 'Auto Detect'), and choose a model for transcription.")
         transcribe_audio_input = gr.Audio(type="filepath", label="Upload Audio File")
         language_dropdown = gr.Dropdown(
-            choices=list(LANGUAGE_NAME_TO_CODE.keys()),  # Full language names
             label="Select Language",
             value="Auto Detect"
         )
         model_dropdown = gr.Dropdown(
-            choices=list(MODELS.keys()),  # Model options
             label="Select Model",
-            value="Base (Faster)",  # Default to "Base" model
-            interactive=True  # Allow model selection by default
         )
         transcribe_output = gr.Textbox(label="Transcription and Detected Language")
         transcribe_button = gr.Button("Transcribe Audio")
     # Update model dropdown based on language selection
     def update_model_dropdown(language):
-        if language in HUGGINGSOUND_MODEL:
-            # Add "HuggingSound Model" to the dropdown choices and disable it
-            return gr.Dropdown(choices=["HuggingSound Model"], value="HuggingSound Model", interactive=False)
         else:
-            # Reset the dropdown to standard Whisper models
-            return gr.Dropdown(choices=list(MODELS.keys()), value="Base (Faster)", interactive=True)
     language_dropdown.change(update_model_dropdown, inputs=language_dropdown, outputs=model_dropdown)
-    # Link button to function
     transcribe_button.click(transcribe_audio, inputs=[transcribe_audio_input, language_dropdown, model_dropdown], outputs=transcribe_output)
 # Launch the Gradio interface
-demo.launch()

 import gradio as gr
 import whisper
 import os
 from pydub import AudioSegment
+from transformers import AutoProcessor, AutoModelForCTC
+import torchaudio
+import torch
 # Mapping of model names to Whisper model sizes
 MODELS = {
     "Large (Most Accurate)": "large"
 }
+# Fine-tuned Sinhala model (using Hugging Face Transformers)
+SINHALA_MODEL = "IAmNotAnanth/wav2vec2-large-xls-r-300m-sinhala"
 # Mapping of full language names to language codes
 LANGUAGE_NAME_TO_CODE = {
     "Galician": "gl",
     "Marathi": "mr",
     "Punjabi": "pa",
+    "Sinhala": "si",
     "Khmer": "km",
     "Shona": "sn",
     "Yoruba": "yo",
     "Sundanese": "su",
 }
+def transcribe_with_whisper(audio_file, language="Auto Detect", model_size="Base (Faster)"):
+    """Transcribe an audio file with one of OpenAI's Whisper models.
+
+    Args:
+        audio_file: Path of the audio file to transcribe.
+        language: Full language name looked up in LANGUAGE_NAME_TO_CODE, or
+            "Auto Detect" to let Whisper identify the language itself.
+        model_size: Display name of the Whisper model; must be a key of MODELS.
+
+    Returns:
+        A string holding the detected (or requested) language code followed
+        by the transcription text, or a short prompt when no file was given.
+    """
+    import tempfile
+
+    if not audio_file:
+        # Nothing to transcribe (e.g. the button was pressed without an upload).
+        return "Please upload an audio file before transcribing."
+    model = whisper.load_model(MODELS[model_size])
+    # Convert audio to 16kHz mono for compatibility with Whisper
     audio = AudioSegment.from_file(audio_file)
     audio = audio.set_frame_rate(16000).set_channels(1)
+    # Use a unique temporary file per call: a fixed name in the working
+    # directory would be overwritten/deleted by any concurrent request.
+    fd, processed_audio_path = tempfile.mkstemp(suffix=".wav")
+    os.close(fd)
+    try:
+        audio.export(processed_audio_path, format="wav")
+        # Transcribe the audio
+        if language == "Auto Detect":
+            result = model.transcribe(processed_audio_path, fp16=False)
+            detected_language = result.get("language", "unknown")
+        else:
+            language_code = LANGUAGE_NAME_TO_CODE.get(language, "en")  # Default to English if not found
+            result = model.transcribe(processed_audio_path, language=language_code, fp16=False)
+            detected_language = language_code
+    finally:
+        # Clean up processed audio file even when export/transcription fails
+        os.remove(processed_audio_path)
     # Return transcription and detected language
+    return f"Detected Language: {detected_language}\n\nTranscription:\n{result['text']}"
+def transcribe_with_sinhala_model(audio_file):
+    """Transcribe an audio file with the fine-tuned Sinhala Wav2Vec2 model.
+
+    Args:
+        audio_file: Path of the audio file to transcribe.
+
+    Returns:
+        A string containing the decoded transcription, or a short prompt
+        when no file was given.
+    """
+    import tempfile
+
+    if not audio_file:
+        # Nothing to transcribe (e.g. the button was pressed without an upload).
+        return "Please upload an audio file before transcribing."
+    processor = AutoProcessor.from_pretrained(SINHALA_MODEL)
+    model = AutoModelForCTC.from_pretrained(SINHALA_MODEL)
+    # Convert audio to 16kHz mono
+    audio = AudioSegment.from_file(audio_file)
+    audio = audio.set_frame_rate(16000).set_channels(1)
+    # Use a unique temporary file per call: a fixed name in the working
+    # directory would be overwritten/deleted by any concurrent request.
+    fd, processed_audio_path = tempfile.mkstemp(suffix=".wav")
+    os.close(fd)
+    try:
+        audio.export(processed_audio_path, format="wav")
+        # Load and process audio
+        audio_input, _ = torchaudio.load(processed_audio_path)
+    finally:
+        # Clean up processed audio file even when export/loading fails
+        os.remove(processed_audio_path)
+    input_values = processor(audio_input.squeeze(), return_tensors="pt", sampling_rate=16000).input_values
+    # Inference only: skip autograd bookkeeping to avoid needless memory use.
+    with torch.no_grad():
+        logits = model(input_values).logits
+    predicted_ids = torch.argmax(logits, dim=-1)
+    # Decode prediction
+    transcription = processor.batch_decode(predicted_ids)[0]
+    return f"Transcription:\n{transcription}"
+def transcribe_audio(audio_file, language="Auto Detect", model_size="Base (Faster)"):
+    """Dispatch the request to the Sinhala model or to Whisper.
+
+    Sinhala is handled by transcribe_with_sinhala_model (which ignores
+    model_size); every other language goes to transcribe_with_whisper.
+    """
+    if language != "Sinhala":
+        return transcribe_with_whisper(audio_file, language, model_size)
+    return transcribe_with_sinhala_model(audio_file)
 # Define the Gradio interface
 with gr.Blocks() as demo:
+    gr.Markdown("# Audio Transcription and Language Detection")
     with gr.Tab("Transcribe Audio"):
         gr.Markdown("Upload an audio file, select a language (or choose 'Auto Detect'), and choose a model for transcription.")
         transcribe_audio_input = gr.Audio(type="filepath", label="Upload Audio File")
         language_dropdown = gr.Dropdown(
+            # NOTE(review): the default below is "Auto Detect"; confirm it is a
+            # key of LANGUAGE_NAME_TO_CODE, otherwise it is not a valid choice.
+            choices=list(LANGUAGE_NAME_TO_CODE.keys()),
             label="Select Language",
             value="Auto Detect"
         )
         model_dropdown = gr.Dropdown(
+            choices=list(MODELS.keys()),
             label="Select Model",
+            value="Base (Faster)"
         )
         transcribe_output = gr.Textbox(label="Transcription and Detected Language")
         transcribe_button = gr.Button("Transcribe Audio")
     # Update model dropdown based on language selection
     def update_model_dropdown(language):
+        """Return the model dropdown state that matches the chosen language."""
+        if language == "Sinhala":
+            # The displayed value must also be one of the dropdown's choices;
+            # without it the value is not a valid selection for the component.
+            return gr.Dropdown(choices=["Fine-Tuned Sinhala Model"], interactive=False, value="Fine-Tuned Sinhala Model")
         else:
+            # Restore the standard Whisper model list for every other language
+            return gr.Dropdown(choices=list(MODELS.keys()), interactive=True, value="Base (Faster)")
     language_dropdown.change(update_model_dropdown, inputs=language_dropdown, outputs=model_dropdown)
     transcribe_button.click(transcribe_audio, inputs=[transcribe_audio_input, language_dropdown, model_dropdown], outputs=transcribe_output)
 # Launch the Gradio interface
+demo.launch()