# Source: Hugging Face Space KDM999/asr-multimodel-comparison (application script).

import html
import json
import os
import random
import string
from difflib import SequenceMatcher

import gradio as gr
import torch
import torchaudio
from jiwer import wer
from transformers import pipeline

# Load the sample metadata: a JSON collection of entries, each providing the
# "age", "gender", "accent", "path" and "sentence" fields read in this file.
# The encoding is pinned so decoding does not depend on the host locale.
with open("common_voice_en_validated_249_hf_ready.json", encoding="utf-8") as f:
    data = json.load(f)

# Distinct metadata values, sorted, used as the dropdown choices in the UI.
ages = sorted({entry["age"] for entry in data})
genders = sorted({entry["gender"] for entry in data})
accents = sorted({entry["accent"] for entry in data})

# Load ASR pipelines
# Run on the first CUDA device when one is available; otherwise use -1, the
# pipeline convention for CPU, so the app still starts on CPU-only hosts.
device = 0 if torch.cuda.is_available() else -1
# The Whisper checkpoints are passed generate_kwargs that fix the language to English.
pipe_whisper_medium = pipeline("automatic-speech-recognition", model="openai/whisper-medium", device=device, generate_kwargs={"language": "en"})
pipe_whisper_base = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device, generate_kwargs={"language": "en"})
pipe_whisper_tiny = pipeline("automatic-speech-recognition", model="openai/whisper-tiny", device=device, generate_kwargs={"language": "en"})
pipe_wav2vec2_base_960h = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h", device=device)
pipe_hubert_large_ls960_ft = pipeline("automatic-speech-recognition", model="facebook/hubert-large-ls960-ft", device=device)

# Functions
def convert_to_wav(file_path):
    """Return the path of a single-channel WAV copy of ``file_path``.

    The WAV file is written next to the source the first time it is
    requested and reused on later calls.

    Args:
        file_path: Path to the source audio file. Only a trailing ``.mp3``
            extension is swapped for ``.wav``; any other path is used as is.

    Returns:
        The path of the WAV file (or ``file_path`` itself when it does not
        end in ``.mp3``).
    """
    root, ext = os.path.splitext(file_path)
    # Swap only the real extension: str.replace(".mp3", ".wav") would also
    # rewrite ".mp3" occurring in a directory name or mid-filename.
    wav_path = root + ".wav" if ext == ".mp3" else file_path
    if not os.path.exists(wav_path):
        waveform, sample_rate = torchaudio.load(file_path)
        # Average over dim 0 (the channel axis in torchaudio's default
        # channels-first layout), keeping that axis with size 1.
        waveform = waveform.mean(dim=0, keepdim=True)
        torchaudio.save(wav_path, waveform, sample_rate)
    return wav_path

def transcribe(pipe, file_path):
    """Run ``pipe`` on ``file_path`` and return its text, trimmed and lowercased.

    ``pipe`` is any callable that takes the path and returns a mapping with
    a ``"text"`` entry.
    """
    prediction = pipe(file_path)
    raw_text = prediction["text"]
    return raw_text.strip().lower()

def highlight_differences(ref, hyp):
    """Render ``hyp`` as HTML with the words that differ from ``ref`` in red.

    Both strings are split on whitespace and aligned word by word with
    ``SequenceMatcher``. Hypothesis words inside an "equal" run are emitted
    as plain text; hypothesis words in any other run are wrapped in a red
    ``<span>``. Reference words missing from the hypothesis produce no output.

    Args:
        ref: Reference transcript.
        hyp: Hypothesis transcript to display.

    Returns:
        The hypothesis words joined by single spaces, as an HTML fragment.
    """
    ref_words = ref.split()
    # Split once up front instead of re-splitting on every opcode.
    hyp_words = hyp.split()
    matcher = SequenceMatcher(None, ref_words, hyp_words)

    pieces = []
    for tag, _i1, _i2, j1, j2 in matcher.get_opcodes():
        # Escape markup characters so transcript text cannot inject HTML.
        words = [html.escape(word, quote=False) for word in hyp_words[j1:j2]]
        if tag != "equal":
            words = [f"<span style='color:red'>{word}</span>" for word in words]
        pieces.extend(words)
    return " ".join(pieces)

def normalize(text):
    """Lowercase ``text``, delete ASCII punctuation, and trim outer whitespace."""
    # Mapping every punctuation character to None makes translate() drop it.
    drop_punctuation = str.maketrans(dict.fromkeys(string.punctuation))
    lowered = text.lower()
    return lowered.translate(drop_punctuation).strip()

# Generate Audio
def generate_audio(age, gender, accent):
    """Pick a random sample whose metadata matches the three selections.

    Returns:
        ``(wav_path, wav_path)`` for the chosen sample (once for the audio
        player, once for the path field), or ``(None, "No matching sample.")``
        when no entry matches.
    """
    def matches(item):
        return (
            item["age"] == age
            and item["gender"] == gender
            and item["accent"] == accent
        )

    candidates = list(filter(matches, data))
    if candidates:
        chosen = random.choice(candidates)
        source_path = os.path.join("common_voice_en_validated_249", chosen["path"])
        wav_file_path = convert_to_wav(source_path)
        return wav_file_path, wav_file_path

    # NOTE(review): the click handler routes this message into the hidden
    # file-path textbox, so it is not shown to the user as-is.
    return None, "No matching sample."

# Transcribe & Compare
def transcribe_audio(file_path):
    """Transcribe one sample with every loaded model and score each result.

    Args:
        file_path: Path of the audio file to transcribe; may be empty or None.

    Returns:
        A 6-tuple. The first item is the normalized reference sentence, or a
        status message when there is nothing to transcribe. The remaining
        five items are HTML snippets, one per model in the order
        whisper-medium, whisper-base, whisper-tiny, wav2vec2, hubert; each
        holds the model name, its WER against the reference, and the
        transcript with differing words highlighted. They are empty strings
        when only a status message is returned.
    """
    # One blank panel per model, so status-only returns have the same arity
    # (6) as the successful return and the six wired output components.
    blank_panels = ("",) * 5

    if not file_path:
        return ("No file selected.", *blank_panels)

    # Metadata entries are matched by the .mp3 file name, so map the .wav
    # name back before searching.
    filename_mp3 = os.path.basename(file_path).replace(".wav", ".mp3")
    gold = next(
        (
            normalize(entry["sentence"])
            for entry in data
            if entry["path"].endswith(filename_mp3)
        ),
        "",
    )
    if not gold:
        return ("Reference not found.", *blank_panels)

    # Insertion order here determines the order of the returned panels.
    models = {
        "openai/whisper-medium": pipe_whisper_medium,
        "openai/whisper-base": pipe_whisper_base,
        "openai/whisper-tiny": pipe_whisper_tiny,
        "facebook/wav2vec2-base-960h": pipe_wav2vec2_base_960h,
        "facebook/hubert-large-ls960-ft": pipe_hubert_large_ls960_ft,
    }

    panels = []
    for name, model in models.items():
        # Normalize the hypothesis the same way as the reference before scoring.
        clean = normalize(transcribe(model, file_path))
        wer_score = wer(gold, clean)
        panels.append(
            f"<b>{name} (WER: {wer_score:.2f}):</b><br>{highlight_differences(gold, clean)}"
        )

    return (gold, *panels)

# Gradio Interface
# Build the UI: metadata dropdowns -> sample picker -> side-by-side transcripts.
with gr.Blocks() as demo:
    gr.Markdown("# Comparing ASR Models on Diverse English Speech Samples")
    # Adjacent string literals are concatenated into one Markdown paragraph.
    gr.Markdown(
        "This demo compares the transcription performance of five automatic speech recognition (ASR) models on audio samples from English learners. "
        "Users can select speaker metadata (age, gender, accent) to explore how models handle diverse speech profiles. "
        "All samples are drawn from the validated subset (n=249) of the English dataset in the Common Voice Delta Segment 21.0 release."
    )

    with gr.Row():
        age = gr.Dropdown(choices=ages, label="Age")
        gender = gr.Dropdown(choices=genders, label="Gender")
        accent = gr.Dropdown(choices=accents, label="Accent")

    generate_btn = gr.Button("Get Audio")
    audio_output = gr.Audio(label="Audio", type="filepath", interactive=False)
    # Hidden field that carries the selected file path to the transcribe step.
    file_path_output = gr.Textbox(label="Audio File Path", visible=False)

    generate_btn.click(generate_audio, [age, gender, accent], [audio_output, file_path_output])

    transcribe_btn = gr.Button("Transcribe with All Models")
    gold_text = gr.Textbox(label="Reference (Gold Standard)")
    whisper_medium_html = gr.HTML(label="Whisper Medium")
    whisper_base_html = gr.HTML(label="Whisper Base")
    whisper_tiny_html = gr.HTML(label="Whisper Tiny")
    wav2vec_html = gr.HTML(label="Wav2Vec2 Base")
    hubert_html = gr.HTML(label="HuBERT Large")

    # Six outputs: the reference text plus one HTML panel per model, in the
    # same order that transcribe_audio returns them.
    transcribe_btn.click(
        transcribe_audio,
        inputs=[file_path_output],
        outputs=[
            gold_text,
            whisper_medium_html,
            whisper_base_html,
            whisper_tiny_html,
            wav2vec_html,
            hubert_html,
        ],
    )

demo.launch()