Spaces:
Running
Running
File size: 6,459 Bytes
305c59b 8a6a9a9 7d07125 1a0ef3f 8a6a9a9 ac8d452 305c59b fce1940 c53ccee ac8d452 7d07125 ac8d452 7d07125 ac8d452 7d07125 305c59b bda7faf 959d3d3 bda7faf fce1940 7d07125 fce1940 7d07125 ac8d452 7d07125 fce1940 7d07125 d600bb8 7d07125 d600bb8 7d07125 fce1940 d600bb8 fce1940 49d93f9 fce1940 7d07125 fce1940 bda7faf fce1940 ac8d452 7d07125 fce1940 305c59b 49d93f9 fce1940 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 |
import gradio as gr
import whisper
import torch
import os
from pydub import AudioSegment
from transformers import pipeline
# Dropdown label -> Whisper checkpoint name handed to whisper.load_model().
# Insertion order is the order the labels appear in the UI.
MODELS = {
    "Tiny (Fastest)": "tiny",
    "Base (Faster)": "base",
    "Small (Balanced)": "small",
    "Medium (Accurate)": "medium",
    "Large (Most Accurate)": "large",
}
# Languages served by a dedicated fine-tuned checkpoint instead of stock Whisper.
# Each entry maps the language's display name to:
#   "model"    - the Hugging Face model id passed to transformers.pipeline()
#   "language" - the language code used to build the decoder prompt
FINE_TUNED_MODELS = {
    "Tamil": {
        "model": "vasista22/whisper-tamil-medium",
        "language": "ta",
    },
    # Add more fine-tuned models for other languages here
}
# Display name -> language code passed to Whisper's transcribe(language=...).
# "Auto Detect" maps to itself and is treated as a sentinel by the caller.
# Insertion order is the order shown in the language dropdown.
LANGUAGE_NAME_TO_CODE = {
    "Auto Detect": "Auto Detect",
    "English": "en",
    "Chinese": "zh",
    "German": "de",
    "Spanish": "es",
    "Russian": "ru",
    "Korean": "ko",
    "French": "fr",
    "Japanese": "ja",
    "Portuguese": "pt",
    "Turkish": "tr",
    "Polish": "pl",
    "Catalan": "ca",
    "Dutch": "nl",
    "Arabic": "ar",
    "Swedish": "sv",
    "Italian": "it",
    "Indonesian": "id",
    "Hindi": "hi",
    "Finnish": "fi",
    "Vietnamese": "vi",
    "Hebrew": "he",
    "Ukrainian": "uk",
    "Greek": "el",
    "Malay": "ms",
    "Czech": "cs",
    "Romanian": "ro",
    "Danish": "da",
    "Hungarian": "hu",
    "Tamil": "ta",
    "Norwegian": "no",
    "Thai": "th",
    "Urdu": "ur",
    "Croatian": "hr",
    "Bulgarian": "bg",
    "Lithuanian": "lt",
    "Latin": "la",
    "Maori": "mi",
    "Malayalam": "ml",
    "Welsh": "cy",
    "Slovak": "sk",
    "Telugu": "te",
    "Persian": "fa",
    "Latvian": "lv",
    "Bengali": "bn",
    "Serbian": "sr",
    "Azerbaijani": "az",
    "Slovenian": "sl",
    "Kannada": "kn",
    "Estonian": "et",
    "Macedonian": "mk",
    "Breton": "br",
    "Basque": "eu",
    "Icelandic": "is",
    "Armenian": "hy",
    "Nepali": "ne",
    "Mongolian": "mn",
    "Bosnian": "bs",
    "Kazakh": "kk",
    "Albanian": "sq",
    "Swahili": "sw",
    "Galician": "gl",
    "Marathi": "mr",
    "Punjabi": "pa",
    "Sinhala": "si",  # Sinhala support
    "Khmer": "km",
    "Shona": "sn",
    "Yoruba": "yo",
    "Somali": "so",
    "Afrikaans": "af",
    "Occitan": "oc",
    "Georgian": "ka",
    "Belarusian": "be",
    "Tajik": "tg",
    "Sindhi": "sd",
    "Gujarati": "gu",
    "Amharic": "am",
    "Yiddish": "yi",
    "Lao": "lo",
    "Uzbek": "uz",
    "Faroese": "fo",
    "Haitian Creole": "ht",
    "Pashto": "ps",
    "Turkmen": "tk",
    "Nynorsk": "nn",
    "Maltese": "mt",
    "Sanskrit": "sa",
    "Luxembourgish": "lb",
    "Burmese": "my",
    "Tibetan": "bo",
    "Tagalog": "tl",
    "Malagasy": "mg",
    "Assamese": "as",
    "Tatar": "tt",
    "Hawaiian": "haw",
    "Lingala": "ln",
    "Hausa": "ha",
    "Bashkir": "ba",
    "Javanese": "jw",
    "Sundanese": "su",
}
def transcribe_audio(audio_file, language="Auto Detect", model_size="Base (Faster)"):
    """Transcribe an audio file and report the language that was used.

    The input is re-encoded to 16 kHz mono WAV before transcription. When
    ``language`` has an entry in ``FINE_TUNED_MODELS`` the matching Hugging
    Face ASR pipeline is used and ``model_size`` is ignored; otherwise the
    Whisper checkpoint selected by ``model_size`` is loaded.

    Args:
        audio_file: Path of the audio file to transcribe. A falsy value
            (``None`` or an empty string, e.g. nothing was uploaded) yields
            a prompt message instead of an exception.
        language: A key of ``LANGUAGE_NAME_TO_CODE``. ``"Auto Detect"`` lets
            Whisper pick the language; names missing from the mapping fall
            back to English.
        model_size: A key of ``MODELS``; only used on the stock Whisper path.

    Returns:
        A single string containing a "Detected Language: ..." line followed
        by a "Transcription:" section with the recognised text, or a short
        prompt message when no audio file was supplied.

    Raises:
        KeyError: If the stock Whisper path is taken and ``model_size`` is
            not a key of ``MODELS``.
    """
    # Imported here so the function does not depend on a new file-level import.
    import tempfile

    if not audio_file:
        return "No audio file provided. Please upload an audio file to transcribe."

    # Convert audio to 16kHz mono for better compatibility. Decoding happens
    # before the temp file is created so a bad input leaves nothing behind.
    audio = AudioSegment.from_file(audio_file)
    audio = audio.set_frame_rate(16000).set_channels(1)

    # A unique temp file per call: a fixed name in the working directory would
    # be overwritten or deleted by another request running at the same time.
    fd, processed_audio_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        # export() hands back the open output file; close it so the handle is
        # released before the models read the path.
        audio.export(processed_audio_path, format="wav").close()

        if language in FINE_TUNED_MODELS:
            # Use the fine-tuned Whisper model for the selected language
            fine_tuned = FINE_TUNED_MODELS[language]
            device = "cuda:0" if torch.cuda.is_available() else "cpu"
            transcribe = pipeline(
                task="automatic-speech-recognition",
                model=fine_tuned["model"],
                chunk_length_s=30,
                device=device,
            )
            # Pin the decoder prompt to the target language and to
            # transcription (as opposed to translation).
            transcribe.model.config.forced_decoder_ids = (
                transcribe.tokenizer.get_decoder_prompt_ids(
                    language=fine_tuned["language"],
                    task="transcribe",
                )
            )
            result = transcribe(processed_audio_path)
            detected_language = language
        else:
            # Use the selected stock Whisper model
            model = whisper.load_model(MODELS[model_size])
            if language == "Auto Detect":
                # No language hint: Whisper detects it and reports it back.
                result = model.transcribe(processed_audio_path, fp16=False)
                detected_language = result.get("language", "unknown")
            else:
                # Default to English if the name is not in the mapping
                language_code = LANGUAGE_NAME_TO_CODE.get(language, "en")
                result = model.transcribe(
                    processed_audio_path, language=language_code, fp16=False
                )
                detected_language = language_code

        transcription = result["text"]
    finally:
        # Always clean up the converted audio, even when transcription fails.
        if os.path.exists(processed_audio_path):
            os.remove(processed_audio_path)

    # Return transcription and detected language
    return f"Detected Language: {detected_language}\n\nTranscription:\n{transcription}"
# Define the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Audio Transcription with Fine-Tuned Models")

    with gr.Tab("Transcribe Audio"):
        gr.Markdown("Upload an audio file, select a language (or choose 'Auto Detect'), and choose a model for transcription.")
        transcribe_audio_input = gr.Audio(type="filepath", label="Upload Audio File")
        language_dropdown = gr.Dropdown(
            choices=list(LANGUAGE_NAME_TO_CODE.keys()),  # Full language names
            label="Select Language",
            value="Auto Detect",
        )
        model_dropdown = gr.Dropdown(
            choices=list(MODELS.keys()),  # Model options
            label="Select Model",
            value="Base (Faster)",  # Default to "Base" model
            interactive=True,  # Allow model selection by default
        )
        transcribe_output = gr.Textbox(label="Transcription and Detected Language")
        transcribe_button = gr.Button("Transcribe Audio")

    def update_model_dropdown(language):
        """Lock or unlock the model dropdown to match the selected language.

        Languages listed in FINE_TUNED_MODELS always use their dedicated
        checkpoint, so the dropdown is frozen on a descriptive label; any
        other language restores the regular Whisper size choices.
        """
        if language in FINE_TUNED_MODELS:
            fine_tuned_label = f"Fine-Tuned {language} Model"
            # The label must also be the dropdown's (only) choice: this
            # dropdown is an input of the transcribe click below, and a value
            # outside `choices` is rejected when the form is submitted.
            return gr.Dropdown(
                choices=[fine_tuned_label],
                value=fine_tuned_label,
                interactive=False,
            )
        return gr.Dropdown(
            choices=list(MODELS.keys()),
            interactive=True,
            value="Base (Faster)",
        )

    # Update model dropdown based on language selection
    language_dropdown.change(
        update_model_dropdown,
        inputs=language_dropdown,
        outputs=model_dropdown,
    )

    # Link button to function
    transcribe_button.click(
        transcribe_audio,
        inputs=[transcribe_audio_input, language_dropdown, model_dropdown],
        outputs=transcribe_output,
    )

# Launch the Gradio interface
demo.launch()