Update app.py
app.py
CHANGED
@@ -5,9 +5,7 @@ import os
 from difflib import SequenceMatcher
 from jiwer import wer
 import torchaudio
-import torch
-from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, HubertForCTC
-import whisper
+from transformers import pipeline

 # Load metadata
 with open("common_voice_en_validated_249_hf_ready.json") as f:
@@ -18,44 +16,21 @@ ages = sorted(set(entry["age"] for entry in data))
 genders = sorted(set(entry["gender"] for entry in data))
 accents = sorted(set(entry["accent"] for entry in data))

-# Load models
-device = "cuda" if torch.cuda.is_available() else "cpu"
+# Load pipelines
+device = 0  # 0 for CUDA/GPU, -1 for CPU

-# Whisper
-whisper_model = whisper.load_model("medium")
-
-# Wav2Vec2
-wav2vec_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")
-wav2vec_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60-self").to(device)
-
-# HuBERT
-hubert_processor = HubertProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
-hubert_model = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft").to(device)
+pipe_whisper = pipeline("automatic-speech-recognition", model="openai/whisper-medium", device=device)
+pipe_wav2vec2 = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h", device=device)
+pipe_hubert = pipeline("automatic-speech-recognition", model="facebook/hubert-base-ls960", device=device)

 def load_audio(file_path):
     waveform, sr = torchaudio.load(file_path)
     return torchaudio.functional.resample(waveform, orig_freq=sr, new_freq=16000)[0].numpy()

-def transcribe_whisper(file_path):
-    result = whisper_model.transcribe(file_path)
+def transcribe(pipe, file_path):
+    result = pipe(file_path)
     return result["text"].strip().lower()

-def transcribe_wav2vec(file_path):
-    audio = load_audio(file_path)
-    inputs = wav2vec_processor(audio, sampling_rate=16000, return_tensors="pt", padding=True)
-    with torch.no_grad():
-        logits = wav2vec_model(**inputs.to(device)).logits
-    predicted_ids = torch.argmax(logits, dim=-1)
-    return wav2vec_processor.batch_decode(predicted_ids)[0].strip().lower()
-
-def transcribe_hubert(file_path):
-    audio = load_audio(file_path)
-    inputs = hubert_processor(audio, sampling_rate=16000, return_tensors="pt", padding=True)
-    with torch.no_grad():
-        logits = hubert_model(**inputs.to(device)).logits
-    predicted_ids = torch.argmax(logits, dim=-1)
-    return hubert_processor.batch_decode(predicted_ids)[0].strip().lower()
-
 def highlight_differences(ref, hyp):
     sm = SequenceMatcher(None, ref.split(), hyp.split())
     result = []
@@ -79,9 +54,9 @@ def run_demo(age, gender, accent):
     file_path = os.path.join("common_voice_en_validated_249", sample["path"])
     gold = sample["sentence"].strip().lower()

-    whisper_text = transcribe_whisper(file_path)
-    wav2vec_text = transcribe_wav2vec(file_path)
-    hubert_text = transcribe_hubert(file_path)
+    whisper_text = transcribe(pipe_whisper, file_path)
+    wav2vec_text = transcribe(pipe_wav2vec2, file_path)
+    hubert_text = transcribe(pipe_hubert, file_path)

     table = f"""
     <table border="1" style="width:100%">
@@ -118,4 +93,4 @@ with gr.Blocks() as demo:
         gr.Textbox(label="Path")
     ])

-demo.launch()
+demo.launch()
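Note on the API the rewrite switches to: a transformers "automatic-speech-recognition" pipeline accepts a path to an audio file (ffmpeg must be available, and the audio is resampled to the model's rate) and returns a dict with a "text" field, which is exactly what the new transcribe(pipe, file_path) helper unpacks. A minimal, self-contained sketch of that call pattern follows; the checkpoint matches pipe_wav2vec2 in the diff, while "sample.wav" is a placeholder path, not part of the commit.

# Sketch of the transformers ASR pipeline call the new app.py relies on.
from transformers import pipeline

asr = pipeline(
    "automatic-speech-recognition",
    model="facebook/wav2vec2-base-960h",  # any CTC-fine-tuned or Whisper checkpoint
    device=-1,                            # -1 = CPU, 0 = first CUDA GPU
)

result = asr("sample.wav")                # decodes the file and returns {"text": "..."}
print(result["text"].strip().lower())     # same normalization the app applies to each hypothesis

Because the pipeline handles decoding and resampling on its own, the new transcribe helper no longer routes audio through load_audio, which the diff keeps, presumably for other parts of the app.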