Spaces:

Athspi
/

Ai-audio

Running

App Files Files Community

Athspi committed on Jan 11

Commit

fce1940

verified ·

1 Parent(s): c53ccee

Update app.py

Browse files

Files changed (1) hide show

app.py +69 -54

app.py CHANGED Viewed

@@ -3,50 +3,17 @@ import whisper
 import os
 from pydub import AudioSegment
-# Load the base Whisper model
-base_model = whisper.load_model("base")  # Default model for non-Sinhala languages
-# Load the fine-tuned Sinhala model (if available)
-sinhala_model = None
-try:
-    from transformers import WhisperForConditionalGeneration, WhisperProcessor
-    sinhala_model = WhisperForConditionalGeneration.from_pretrained("Subhaka/whisper-small-Sinhala-Fine_Tune")
-    sinhala_processor = WhisperProcessor.from_pretrained("Subhaka/whisper-small-Sinhala-Fine_Tune")
-except Exception as e:
-    print("Failed to load fine-tuned Sinhala model. Falling back to the base model.")
-    print(f"Error: {e}")
-def transcribe_audio(audio_file, language="Auto Detect"):
-    # Convert audio to 16kHz mono for better compatibility with Whisper
-    audio = AudioSegment.from_file(audio_file)
-    audio = audio.set_frame_rate(16000).set_channels(1)
-    processed_audio_path = "processed_audio.wav"
-    audio.export(processed_audio_path, format="wav")
-    # Load the appropriate model based on the selected language
-    if language == "Sinhala" and sinhala_model is not None:
-        print("Using fine-tuned Sinhala model.")
-        model = sinhala_model
-        processor = sinhala_processor
-    else:
-        print("Using base Whisper model.")
-        model = base_model
-        processor = None
-    # Transcribe the audio
-    if language == "Auto Detect":
-        result = model.transcribe(processed_audio_path, fp16=False)  # Auto-detect language
-        detected_language = result.get("language", "unknown")
-    else:
-        language_code = LANGUAGE_NAME_TO_CODE.get(language, "en")  # Default to English if not found
-        result = model.transcribe(processed_audio_path, language=language_code, fp16=False)
-        detected_language = language_code
-    # Clean up processed audio file
-    os.remove(processed_audio_path)
-    # Return transcription and detected language
-    return f"Detected Language: {detected_language}\n\nTranscription:\n{result['text']}"
 # Mapping of full language names to language codes
 LANGUAGE_NAME_TO_CODE = {
@@ -152,21 +119,69 @@ LANGUAGE_NAME_TO_CODE = {
     "Sundanese": "su",
 }
 # Define the Gradio interface
-iface = gr.Interface(
-    fn=transcribe_audio,
-    inputs=[
-        gr.Audio(type="filepath", label="Upload Audio File"),
-        gr.Dropdown(
             choices=list(LANGUAGE_NAME_TO_CODE.keys()),  # Full language names
             label="Select Language",
             value="Auto Detect"
         )
-    ],
-    outputs=gr.Textbox(label="Transcription and Detected Language"),
-    title="Audio Transcription with Language Selection",
-    description="Upload an audio file and select a language (or choose 'Auto Detect'). For Sinhala, a fine-tuned model will be used automatically."
-)
 # Launch the Gradio interface
-iface.launch()

 import os
 from pydub import AudioSegment
+# Mapping of user-facing model labels (the choices of the "Select Model"
+# dropdown) to the Whisper model size names passed to whisper.load_model().
+MODELS = {
+    "Tiny (Fastest)": "tiny",
+    "Base (Faster)": "base",
+    "Small (Balanced)": "small",
+    "Medium (Accurate)": "medium",
+    "Large (Most Accurate)": "large"
+}
+# Identifier of the fine-tuned Sinhala model; passed to gr.load() when the
+# selected language is "Sinhala" (presumably a Hugging Face Hub repo id --
+# TODO confirm).
+SINHALA_MODEL = "malakazzz/Subhaka-whisper-small-Sinhala-Fine_Tune"
 # Mapping of full language names to language codes
 LANGUAGE_NAME_TO_CODE = {
     "Sundanese": "su",
 }
+def transcribe_audio(audio_file, language="Auto Detect", model_size="Base (Faster)"):
+    """Transcribe an audio file and report the language that was used.
+
+    Args:
+        audio_file: Path to the input audio file.
+        language: Full language name (a key of LANGUAGE_NAME_TO_CODE), or
+            "Auto Detect" to let the model detect the language itself.
+        model_size: Label from MODELS selecting the Whisper model. Ignored
+            when ``language`` is "Sinhala"; unknown labels fall back to "base".
+
+    Returns:
+        A string with the detected (or selected) language code followed by
+        the transcription text, or a short message when no file was given.
+    """
+    import tempfile  # local import: only this function needs it
+
+    # The audio input may be None/empty when nothing was uploaded.
+    if not audio_file:
+        return "No audio file provided. Please upload an audio file."
+    # Load the appropriate model
+    if language == "Sinhala":
+        # Use the fine-tuned Sinhala model
+        # NOTE(review): gr.load() returns a Gradio app object; confirm that
+        # it actually exposes the .transcribe() method called below.
+        model = gr.load(SINHALA_MODEL)
+    else:
+        # Use the selected Whisper model, defaulting to "base" for an unknown
+        # label (mirrors the "en" fallback used for language codes below).
+        model = whisper.load_model(MODELS.get(model_size, "base"))
+    # Convert audio to 16kHz mono for better compatibility with Whisper
+    audio = AudioSegment.from_file(audio_file)
+    audio = audio.set_frame_rate(16000).set_channels(1)
+    # Write to a unique temporary file: a fixed shared filename would let
+    # concurrent requests overwrite each other's audio.
+    fd, processed_audio_path = tempfile.mkstemp(suffix=".wav")
+    os.close(fd)
+    try:
+        audio.export(processed_audio_path, format="wav")
+        # Transcribe the audio
+        if language == "Auto Detect":
+            result = model.transcribe(processed_audio_path, fp16=False)  # Auto-detect language
+            detected_language = result.get("language", "unknown")
+        else:
+            language_code = LANGUAGE_NAME_TO_CODE.get(language, "en")  # Default to English if not found
+            result = model.transcribe(processed_audio_path, language=language_code, fp16=False)
+            detected_language = language_code
+    finally:
+        # Clean up the processed audio file even if export/transcription fails
+        os.remove(processed_audio_path)
+    # Return transcription and detected language
+    return f"Detected Language: {detected_language}\n\nTranscription:\n{result['text']}"
 # Define the Gradio interface
+with gr.Blocks() as demo:
+    gr.Markdown("# Audio Transcription and Language Detection")
+    with gr.Tab("Transcribe Audio"):
+        gr.Markdown("Upload an audio file, select a language (or choose 'Auto Detect'), and choose a model for transcription.")
+        transcribe_audio_input = gr.Audio(type="filepath", label="Upload Audio File")
+        language_dropdown = gr.Dropdown(
             choices=list(LANGUAGE_NAME_TO_CODE.keys()),  # Full language names
             label="Select Language",
             value="Auto Detect"
         )
+        model_dropdown = gr.Dropdown(
+            choices=list(MODELS.keys()),  # Model options
+            label="Select Model",
+            value="Base (Faster)",  # Default to "Base" model
+            interactive=True  # Allow model selection by default
+        )
+        transcribe_output = gr.Textbox(label="Transcription and Detected Language")
+        transcribe_button = gr.Button("Transcribe Audio")
+    # Update model dropdown based on language selection
+    def update_model_dropdown(language):
+        """Return the model dropdown state matching the selected language.
+
+        For "Sinhala" the dropdown is locked and shows a fixed label, because
+        transcribe_audio ignores the model choice for that language. For any
+        other language the regular Whisper choices are restored.
+        """
+        if language == "Sinhala":
+            sinhala_label = "Fine-Tuned Sinhala Model"
+            # The displayed value has to be one of the dropdown's choices, so
+            # replace the choices together with the value.
+            return gr.Dropdown(choices=[sinhala_label], interactive=False, value=sinhala_label)
+        else:
+            return gr.Dropdown(choices=list(MODELS.keys()), interactive=True, value="Base (Faster)")
+    language_dropdown.change(update_model_dropdown, inputs=language_dropdown, outputs=model_dropdown)
+    # Link button to function
+    transcribe_button.click(transcribe_audio, inputs=[transcribe_audio_input, language_dropdown, model_dropdown], outputs=transcribe_output)
 # Launch the Gradio interface
+demo.launch()