Spaces:

Athspi
/

Ai-audio

Running

App Files Files Community

Athspi committed on Jan 11

Commit

d600bb8

verified ·

1 Parent(s): 0f8086c

Update app.py

Browse files

Files changed (1) hide show

app.py +36 -8

app.py CHANGED Viewed

@@ -2,6 +2,8 @@ import gradio as gr
 import whisper
 import os
 from pydub import AudioSegment
 # Mapping of model names to Whisper model sizes
 MODELS = {
@@ -13,7 +15,7 @@ MODELS = {
 }
 # Fine-tuned Sinhala model
-SINHALA_MODEL = "malakazzz/Subhaka-whisper-small-Sinhala-Fine_Tune"
 # Mapping of full language names to language codes
 LANGUAGE_NAME_TO_CODE = {
@@ -119,15 +121,21 @@ LANGUAGE_NAME_TO_CODE = {
     "Sundanese": "su",
 }
 def transcribe_audio(audio_file, language="Auto Detect", model_size="Base (Faster)"):
     """Transcribe the audio file."""
     # Load the appropriate model
     if language == "Sinhala":
         # Use the fine-tuned Sinhala model
-        model = gr.load(SINHALA_MODEL)
     else:
         # Use the selected Whisper model
         model = whisper.load_model(MODELS[model_size])
     # Convert audio to 16kHz mono for better compatibility with Whisper
     audio = AudioSegment.from_file(audio_file)
@@ -137,18 +145,38 @@ def transcribe_audio(audio_file, language="Auto Detect", model_size="Base (Faste
     # Transcribe the audio
     if language == "Auto Detect":
-        result = model.transcribe(processed_audio_path, fp16=False)  # Auto-detect language
-        detected_language = result.get("language", "unknown")
     else:
-        language_code = LANGUAGE_NAME_TO_CODE.get(language, "en")  # Default to English if not found
-        result = model.transcribe(processed_audio_path, language=language_code, fp16=False)
-        detected_language = language_code
     # Clean up processed audio file
     os.remove(processed_audio_path)
     # Return transcription and detected language
-    return f"Detected Language: {detected_language}\n\nTranscription:\n{result['text']}"
 # Define the Gradio interface
 with gr.Blocks() as demo:

 import whisper
 import os
 from pydub import AudioSegment
+from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
+import torch
 # Mapping of model names to Whisper model sizes
 MODELS = {
 }
 # Fine-tuned Sinhala model
+SINHALA_MODEL = "Subhaka/whisper-small-Sinhala-Fine_Tune"
 # Mapping of full language names to language codes
 LANGUAGE_NAME_TO_CODE = {
     "Sundanese": "su",
 }
+# Load the fine-tuned Sinhala model and processor
+processor = AutoProcessor.from_pretrained(SINHALA_MODEL)
+sinhala_model = AutoModelForSpeechSeq2Seq.from_pretrained(SINHALA_MODEL)
 def transcribe_audio(audio_file, language="Auto Detect", model_size="Base (Faster)"):
     """Transcribe the audio file."""
     # Load the appropriate model
     if language == "Sinhala":
         # Use the fine-tuned Sinhala model
+        model = sinhala_model
+        model_processor = processor
     else:
         # Use the selected Whisper model
         model = whisper.load_model(MODELS[model_size])
+        model_processor = None
     # Convert audio to 16kHz mono for better compatibility with Whisper
     audio = AudioSegment.from_file(audio_file)
     # Transcribe the audio
     if language == "Auto Detect":
+        if model_processor:
+            # Use the fine-tuned Sinhala model for transcription
+            inputs = model_processor(processed_audio_path, return_tensors="pt", sampling_rate=16000)
+            with torch.no_grad():
+                generated_ids = model.generate(inputs.input_features)
+            transcription = model_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+            detected_language = "si"
+        else:
+            # Use Whisper for auto-detection
+            result = model.transcribe(processed_audio_path, fp16=False)
+            transcription = result["text"]
+            detected_language = result.get("language", "unknown")
     else:
+        if model_processor:
+            # Use the fine-tuned Sinhala model for transcription
+            inputs = model_processor(processed_audio_path, return_tensors="pt", sampling_rate=16000)
+            with torch.no_grad():
+                generated_ids = model.generate(inputs.input_features)
+            transcription = model_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+            detected_language = "si"
+        else:
+            # Use Whisper for transcription with the selected language
+            language_code = LANGUAGE_NAME_TO_CODE.get(language, "en")
+            result = model.transcribe(processed_audio_path, language=language_code, fp16=False)
+            transcription = result["text"]
+            detected_language = language_code
     # Clean up processed audio file
     os.remove(processed_audio_path)
     # Return transcription and detected language
+    return f"Detected Language: {detected_language}\n\nTranscription:\n{transcription}"
 # Define the Gradio interface
 with gr.Blocks() as demo: