|
# Evaluate ASR models |
|
|
|
This is a breakdown of the steps used to evaluate ASR models on a small subset of the LibriSpeech dataset, based on the script in the `evaluate_asr.py` file.
|
|
|
## 0. Import the necessary libraries |
|
|
|
```python
from datasets import load_dataset, Dataset
from transformers import pipeline
import evaluate
import torch
import numpy as np
from tqdm import tqdm
import gradio as gr
from collections import defaultdict
import json
```
|
|
|
|
|
## 1. Pick an English speech dataset from the Hugging Face Hub and create a small subset (100 rows) by streaming the data
|
|
|
We will use the `librispeech_asr` dataset from the Hugging Face Hub, with the `clean` configuration and the `validation` split.
|
|
|
```python
# Stream the validation split of the "clean" configuration so the full
# dataset is never downloaded, then keep only the first 100 rows
ds = load_dataset("openslr/librispeech_asr", "clean", split="validation", streaming=True)
ds = ds.take(100)
```
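
If you want to sanity-check the fields the evaluation relies on, you can peek at the first streamed row (a quick check, not part of `evaluate_asr.py`):

```python
# Inspect one streamed example: the reference transcript plus the decoded audio
sample = next(iter(ds))
print(sample["text"])                    # reference transcript (upper-case)
print(sample["audio"]["sampling_rate"])  # 16000 Hz for LibriSpeech
print(sample["audio"]["array"].shape)    # raw waveform as a 1-D NumPy array
```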
|
## 2. Pick three transformers-compatible speech recognition models
|
|
|
We will evaluate the following models: |
|
- `openai/whisper-tiny.en` |
|
- `facebook/wav2vec2-base-960h` |
|
- `distil-whisper/distil-small.en` |
|
|
|
```python
# Map a short display name to each Hugging Face checkpoint
model_name = {
    "whisper-tiny": "openai/whisper-tiny.en",
    "wav2vec2-base-960h": "facebook/wav2vec2-base-960h",
    "distil-whisper-small": "distil-whisper/distil-small.en",
}
```
|
|
|
## 3. Evaluate the models on the dataset
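
The comparison below uses word error rate (WER): the number of word substitutions, insertions, and deletions needed to turn the prediction into the reference, divided by the number of reference words. A tiny self-contained illustration (not part of the script):

```python
import evaluate

# One substituted word out of four reference words gives a WER of 0.25
wer_metric = evaluate.load("wer")
print(wer_metric.compute(predictions=["THE CAT SAT DOWN"],
                         references=["THE CAT SAT UP"]))  # 0.25
```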
|
|
|
```python
def evaluate_model(ds, pipe, wer_metric):
    wer_scores = []
    wer_results = []
    # Materialize the streamed subset once so tqdm can report progress
    # without re-streaming the data just to count it
    samples = list(ds)
    for idx, sample in enumerate(tqdm(samples, desc="Evaluating", total=len(samples))):
        # The decoded audio dict carries both the waveform and its sampling
        # rate, so the pipeline can resample if a model expects a different rate
        audio_sample = sample["audio"]
        transcription = pipe(audio_sample)["text"]
        # Keep only letters and spaces for evaluation
        transcription = "".join([char for char in transcription if char.isalpha() or char.isspace()])
        # Upper-case both sides so the comparison is case-insensitive
        wer = wer_metric.compute(predictions=[transcription.upper()], references=[sample["text"].upper()])
        wer_scores.append(wer)
        wer_results.append({
            "index": idx,
            "transcription": transcription.upper(),
            "reference": sample["text"].upper(),
            "wer": wer,
        })
    return wer_scores, wer_results


# Load the WER metric (backed by the jiwer package)
wer_metric = evaluate.load("wer")

results = {}
model_wer_results = {}
# Evaluate each model with an automatic-speech-recognition pipeline
for model in model_name:
    # Run on the GPU when one is available
    device = 0 if torch.cuda.is_available() else -1
    pipe = pipeline("automatic-speech-recognition", model=model_name[model], device=device)
    wer_scores, wer_results = evaluate_model(ds, pipe, wer_metric)
    # Mean of per-utterance WERs (a corpus-level WER would weight by reference length)
    results[model] = np.mean(wer_scores)
    model_wer_results[model] = wer_results

for model in results:
    print(f"Model: {model}, WER: {results[model]:.3f}")
```
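
The `json` import at the top suggests the full script also saves the per-sample results; a minimal sketch of what that could look like (the filename and layout are assumptions, not taken from `evaluate_asr.py`):

```python
# Hypothetical follow-up: dump both the mean WER per model and the
# per-sample results to disk for later inspection (e.g. in a Gradio demo).
# "wer_results.json" is an assumed filename, not defined in evaluate_asr.py.
with open("wer_results.json", "w") as f:
    json.dump(
        {
            "mean_wer": {m: float(v) for m, v in results.items()},  # np.float64 -> float
            "per_sample": model_wer_results,
        },
        f,
        indent=2,
    )
```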