Update app.py
app.py
CHANGED
@@ -5,9 +5,7 @@ import os
 from difflib import SequenceMatcher
 from jiwer import wer
 import torchaudio
-import torch
-from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, HubertForCTC
-import whisper
+from transformers import pipeline

 # Load metadata
 with open("common_voice_en_validated_249_hf_ready.json") as f:
@@ -18,44 +16,21 @@ ages = sorted(set(entry["age"] for entry in data))
 genders = sorted(set(entry["gender"] for entry in data))
 accents = sorted(set(entry["accent"] for entry in data))

-# Load models
-device = "cuda" if torch.cuda.is_available() else "cpu"
+# Load pipelines
+device = 0  # 0 for CUDA/GPU, -1 for CPU

-# Whisper
-whisper_model = whisper.load_model("medium")
-
-# Wav2Vec2
-wav2vec_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")
-wav2vec_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60-self").to(device)
-
-# HuBERT
-hubert_processor = HubertProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
-hubert_model = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft").to(device)
+pipe_whisper = pipeline("automatic-speech-recognition", model="openai/whisper-medium", device=device)
+pipe_wav2vec2 = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h", device=device)
+pipe_hubert = pipeline("automatic-speech-recognition", model="facebook/hubert-base-ls960", device=device)

 def load_audio(file_path):
     waveform, sr = torchaudio.load(file_path)
     return torchaudio.functional.resample(waveform, orig_freq=sr, new_freq=16000)[0].numpy()

-def transcribe_whisper(file_path):
-    result = whisper_model.transcribe(file_path)
+def transcribe(pipe, file_path):
+    result = pipe(file_path)
     return result["text"].strip().lower()

-def transcribe_wav2vec(file_path):
-    audio = load_audio(file_path)
-    inputs = wav2vec_processor(audio, sampling_rate=16000, return_tensors="pt", padding=True)
-    with torch.no_grad():
-        logits = wav2vec_model(**inputs.to(device)).logits
-    predicted_ids = torch.argmax(logits, dim=-1)
-    return wav2vec_processor.batch_decode(predicted_ids)[0].strip().lower()
-
-def transcribe_hubert(file_path):
-    audio = load_audio(file_path)
-    inputs = hubert_processor(audio, sampling_rate=16000, return_tensors="pt", padding=True)
-    with torch.no_grad():
-        logits = hubert_model(**inputs.to(device)).logits
-    predicted_ids = torch.argmax(logits, dim=-1)
-    return hubert_processor.batch_decode(predicted_ids)[0].strip().lower()
-
 def highlight_differences(ref, hyp):
     sm = SequenceMatcher(None, ref.split(), hyp.split())
     result = []
@@ -79,9 +54,9 @@ def run_demo(age, gender, accent):
     file_path = os.path.join("common_voice_en_validated_249", sample["path"])
     gold = sample["sentence"].strip().lower()

-    whisper_text = transcribe_whisper(file_path)
-    wav2vec_text = transcribe_wav2vec(file_path)
-    hubert_text = transcribe_hubert(file_path)
+    whisper_text = transcribe(pipe_whisper, file_path)
+    wav2vec_text = transcribe(pipe_wav2vec2, file_path)
+    hubert_text = transcribe(pipe_hubert, file_path)

     table = f"""
     <table border="1" style="width:100%">
@@ -118,4 +93,4 @@ with gr.Blocks() as demo:
         gr.Textbox(label="Path")
     ])

-demo.launch()
+demo.launch()
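Note on the API the rewrite switches to: a transformers "automatic-speech-recognition" pipeline accepts a path to an audio file (ffmpeg must be available, and the audio is resampled to the model's rate) and returns a dict with a "text" field, which is exactly what the new transcribe(pipe, file_path) helper unpacks. A minimal, self-contained sketch of that call pattern follows; the checkpoint matches pipe_wav2vec2 in the diff, while "sample.wav" is a placeholder path, not part of the commit.

# Sketch of the transformers ASR pipeline call the new app.py relies on.
from transformers import pipeline

asr = pipeline(
    "automatic-speech-recognition",
    model="facebook/wav2vec2-base-960h",  # any CTC-fine-tuned or Whisper checkpoint
    device=-1,                            # -1 = CPU, 0 = first CUDA GPU
)

result = asr("sample.wav")                # decodes the file and returns {"text": "..."}
print(result["text"].strip().lower())     # same normalization the app applies to each hypothesis

Because the pipeline handles decoding and resampling on its own, the new transcribe helper no longer routes audio through load_audio, which the diff keeps, presumably for other parts of the app.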