Spaces:
Running
Running
File size: 4,949 Bytes
305c59b 8a6a9a9 1a0ef3f 8a6a9a9 305c59b c53ccee 305c59b bda7faf 959d3d3 bda7faf 49d93f9 305c59b bda7faf 8ff4639 c53ccee 305c59b 49d93f9 8ff4639 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 |
import gradio as gr
import whisper
import os
from pydub import AudioSegment
# Load the base Whisper model
base_model = whisper.load_model("base") # Default model for non-Sinhala languages
# Load the fine-tuned Sinhala model (if available)
sinhala_model = None
try:
from transformers import WhisperForConditionalGeneration, WhisperProcessor
sinhala_model = WhisperForConditionalGeneration.from_pretrained("Subhaka/whisper-small-Sinhala-Fine_Tune")
sinhala_processor = WhisperProcessor.from_pretrained("Subhaka/whisper-small-Sinhala-Fine_Tune")
except Exception as e:
print("Failed to load fine-tuned Sinhala model. Falling back to the base model.")
print(f"Error: {e}")
def transcribe_audio(audio_file, language="Auto Detect"):
# Convert audio to 16kHz mono for better compatibility with Whisper
audio = AudioSegment.from_file(audio_file)
audio = audio.set_frame_rate(16000).set_channels(1)
processed_audio_path = "processed_audio.wav"
audio.export(processed_audio_path, format="wav")
# Load the appropriate model based on the selected language
if language == "Sinhala" and sinhala_model is not None:
print("Using fine-tuned Sinhala model.")
model = sinhala_model
processor = sinhala_processor
else:
print("Using base Whisper model.")
model = base_model
processor = None
# Transcribe the audio
if language == "Auto Detect":
result = model.transcribe(processed_audio_path, fp16=False) # Auto-detect language
detected_language = result.get("language", "unknown")
else:
language_code = LANGUAGE_NAME_TO_CODE.get(language, "en") # Default to English if not found
result = model.transcribe(processed_audio_path, language=language_code, fp16=False)
detected_language = language_code
# Clean up processed audio file
os.remove(processed_audio_path)
# Return transcription and detected language
return f"Detected Language: {detected_language}\n\nTranscription:\n{result['text']}"
# Mapping of full language names to language codes
LANGUAGE_NAME_TO_CODE = {
"Auto Detect": "Auto Detect",
"English": "en",
"Chinese": "zh",
"German": "de",
"Spanish": "es",
"Russian": "ru",
"Korean": "ko",
"French": "fr",
"Japanese": "ja",
"Portuguese": "pt",
"Turkish": "tr",
"Polish": "pl",
"Catalan": "ca",
"Dutch": "nl",
"Arabic": "ar",
"Swedish": "sv",
"Italian": "it",
"Indonesian": "id",
"Hindi": "hi",
"Finnish": "fi",
"Vietnamese": "vi",
"Hebrew": "he",
"Ukrainian": "uk",
"Greek": "el",
"Malay": "ms",
"Czech": "cs",
"Romanian": "ro",
"Danish": "da",
"Hungarian": "hu",
"Tamil": "ta",
"Norwegian": "no",
"Thai": "th",
"Urdu": "ur",
"Croatian": "hr",
"Bulgarian": "bg",
"Lithuanian": "lt",
"Latin": "la",
"Maori": "mi",
"Malayalam": "ml",
"Welsh": "cy",
"Slovak": "sk",
"Telugu": "te",
"Persian": "fa",
"Latvian": "lv",
"Bengali": "bn",
"Serbian": "sr",
"Azerbaijani": "az",
"Slovenian": "sl",
"Kannada": "kn",
"Estonian": "et",
"Macedonian": "mk",
"Breton": "br",
"Basque": "eu",
"Icelandic": "is",
"Armenian": "hy",
"Nepali": "ne",
"Mongolian": "mn",
"Bosnian": "bs",
"Kazakh": "kk",
"Albanian": "sq",
"Swahili": "sw",
"Galician": "gl",
"Marathi": "mr",
"Punjabi": "pa",
"Sinhala": "si", # Sinhala support
"Khmer": "km",
"Shona": "sn",
"Yoruba": "yo",
"Somali": "so",
"Afrikaans": "af",
"Occitan": "oc",
"Georgian": "ka",
"Belarusian": "be",
"Tajik": "tg",
"Sindhi": "sd",
"Gujarati": "gu",
"Amharic": "am",
"Yiddish": "yi",
"Lao": "lo",
"Uzbek": "uz",
"Faroese": "fo",
"Haitian Creole": "ht",
"Pashto": "ps",
"Turkmen": "tk",
"Nynorsk": "nn",
"Maltese": "mt",
"Sanskrit": "sa",
"Luxembourgish": "lb",
"Burmese": "my",
"Tibetan": "bo",
"Tagalog": "tl",
"Malagasy": "mg",
"Assamese": "as",
"Tatar": "tt",
"Hawaiian": "haw",
"Lingala": "ln",
"Hausa": "ha",
"Bashkir": "ba",
"Javanese": "jw",
"Sundanese": "su",
}
# Define the Gradio interface
iface = gr.Interface(
fn=transcribe_audio,
inputs=[
gr.Audio(type="filepath", label="Upload Audio File"),
gr.Dropdown(
choices=list(LANGUAGE_NAME_TO_CODE.keys()), # Full language names
label="Select Language",
value="Auto Detect"
)
],
outputs=gr.Textbox(label="Transcription and Detected Language"),
title="Audio Transcription with Language Selection",
description="Upload an audio file and select a language (or choose 'Auto Detect'). For Sinhala, a fine-tuned model will be used automatically."
)
# Launch the Gradio interface
iface.launch() |