import gradio as gr
import random
import json
from difflib import SequenceMatcher
from jiwer import wer
import torchaudio
from transformers import pipeline
import os
import string

# Load metadata
with open("common_voice_en_validated_249_hf_ready.json") as f:
    data = json.load(f)

# Prepare dropdown options
ages = sorted(set(entry["age"] for entry in data))
genders = sorted(set(entry["gender"] for entry in data))
accents = sorted(set(entry["accent"] for entry in data))

# Load ASR pipelines (device=0 assumes a CUDA GPU; use device=-1 to run on CPU)
device = 0
pipe_whisper_medium = pipeline("automatic-speech-recognition", model="openai/whisper-medium", device=device, generate_kwargs={"language": "en"})
pipe_whisper_base = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device, generate_kwargs={"language": "en"})
pipe_whisper_tiny = pipeline("automatic-speech-recognition", model="openai/whisper-tiny", device=device, generate_kwargs={"language": "en"})
pipe_wav2vec2_base_960h = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h", device=device)
pipe_hubert_large_ls960_ft = pipeline("automatic-speech-recognition", model="facebook/hubert-large-ls960-ft", device=device)

# Functions
def convert_to_wav(file_path):
    # Convert an mp3 sample to mono wav (cached on disk) so all pipelines receive the same input format.
    wav_path = file_path.replace(".mp3", ".wav")
    if not os.path.exists(wav_path):
        waveform, sample_rate = torchaudio.load(file_path)
        waveform = waveform.mean(dim=0, keepdim=True)
        torchaudio.save(wav_path, waveform, sample_rate)
    return wav_path

def transcribe(pipe, file_path):
    result = pipe(file_path)
    return result["text"].strip().lower()

def highlight_differences(ref, hyp):
    # Align reference and hypothesis word by word; words the model got wrong are
    # wrapped in HTML markup so they render highlighted in the gr.HTML outputs.
    sm = SequenceMatcher(None, ref.split(), hyp.split())
    result = []
    for opcode, i1, i2, j1, j2 in sm.get_opcodes():
        if opcode == "equal":
            result.extend(hyp.split()[j1:j2])
        else:
            wrong = hyp.split()[j1:j2]
            result.extend([f"<span style='color:red'>{w}</span>" for w in wrong])
    return " ".join(result)

def normalize(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text.strip()

# Generate Audio
def generate_audio(age, gender, accent):
    filtered = [
        entry for entry in data
        if entry["age"] == age and entry["gender"] == gender and entry["accent"] == accent
    ]
    if not filtered:
        return None, "No matching sample."
    sample = random.choice(filtered)
    file_path = os.path.join("common_voice_en_validated_249", sample["path"])
    wav_file_path = convert_to_wav(file_path)
    return wav_file_path, wav_file_path

# Transcribe & Compare
def transcribe_audio(file_path):
    # Six outputs: the reference transcript plus one HTML block per model.
    if not file_path:
        return "No file selected.", "", "", "", "", ""
    filename_mp3 = os.path.basename(file_path).replace(".wav", ".mp3")
    gold = ""
    for entry in data:
        if entry["path"].endswith(filename_mp3):
            gold = normalize(entry["sentence"])
            break
    if not gold:
        return "Reference not found.", "", "", "", "", ""
    outputs = {}
    models = {
        "openai/whisper-medium": pipe_whisper_medium,
        "openai/whisper-base": pipe_whisper_base,
        "openai/whisper-tiny": pipe_whisper_tiny,
        "facebook/wav2vec2-base-960h": pipe_wav2vec2_base_960h,
        "facebook/hubert-large-ls960-ft": pipe_hubert_large_ls960_ft,
    }
    for name, model in models.items():
        text = transcribe(model, file_path)
        clean = normalize(text)
        wer_score = wer(gold, clean)
        outputs[name] = (
            f"<b>{name} (WER: {wer_score:.2f}):</b><br>"
            f"{highlight_differences(gold, clean)}"
        )
    return (gold, *outputs.values())

# Gradio Interface
with gr.Blocks() as demo:
    gr.Markdown("# Comparing ASR Models on Diverse English Speech Samples")
    gr.Markdown(
        "This demo compares the transcription performance of five automatic speech recognition (ASR) models on audio samples from English learners. "
        "Users can select speaker metadata (age, gender, accent) to explore how models handle diverse speech profiles. "
        "All samples are drawn from the validated subset (n=249) of the English dataset in the Common Voice Delta Segment 21.0 release."
    )
    with gr.Row():
        age = gr.Dropdown(choices=ages, label="Age")
        gender = gr.Dropdown(choices=genders, label="Gender")
        accent = gr.Dropdown(choices=accents, label="Accent")
    generate_btn = gr.Button("Get Audio")
    audio_output = gr.Audio(label="Audio", type="filepath", interactive=False)
    file_path_output = gr.Textbox(label="Audio File Path", visible=False)
    generate_btn.click(generate_audio, [age, gender, accent], [audio_output, file_path_output])

    transcribe_btn = gr.Button("Transcribe with All Models")
    gold_text = gr.Textbox(label="Reference (Gold Standard)")
    whisper_medium_html = gr.HTML(label="Whisper Medium")
    whisper_base_html = gr.HTML(label="Whisper Base")
    whisper_tiny_html = gr.HTML(label="Whisper Tiny")
    wav2vec_html = gr.HTML(label="Wav2Vec2 Base")
    hubert_html = gr.HTML(label="HuBERT Large")
    transcribe_btn.click(
        transcribe_audio,
        inputs=[file_path_output],
        outputs=[
            gold_text,
            whisper_medium_html,
            whisper_base_html,
            whisper_tiny_html,
            wav2vec_html,
            hubert_html,
        ],
    )

demo.launch()