Spaces:
Running
Running
alessandro trinca tornidor
feat: port whisper and faster-whisper support from https://github.com/Thiagohgl/ai-pronunciation-trainer
85b7206
import unittest | |
import torch | |
from torchaudio.transforms import Resample | |
import pronunciationTrainer | |
from constants import sample_rate_start, sample_rate_resample, MODEL_NAME_DEFAULT | |
from lambdaSpeechToScore import audioread_load | |
from tests import EVENTS_FOLDER, set_seed | |
# Module-level fixtures, built once when this test module is imported.
# One trainer per language, obtained from pronunciationTrainer.getTrainer
# (German first, then English).
trainer_sst_lambda = {
    language_code: pronunciationTrainer.getTrainer(language_code)
    for language_code in ("de", "en")
}

# Resampler shared by every test: converts audio from ``sample_rate_start``
# to ``sample_rate_resample`` (both imported from ``constants``).
transform = Resample(
    orig_freq=sample_rate_start,
    new_freq=sample_rate_resample,
)
def helper_neural_asr(language: str, model_name: str):
    """Run one ASR model on the ``test_<language>_easy.wav`` clip.

    The clip is read from ``EVENTS_FOLDER`` with ``audioread_load``, passed
    through the module-level ``transform`` resampler, given a leading
    dimension with ``unsqueeze(0)`` and preprocessed with
    ``pronunciationTrainer.preprocessAudioStandalone`` before being handed
    to the model returned by ``models.getASRModel``.

    Args:
        language: Language code used both to pick the audio file and to
            select the ASR model (the tests in this file use "de" and "en").
        model_name: Name of the ASR back-end, forwarded to
            ``models.getASRModel``.

    Returns:
        A ``(transcript, word_locations)`` tuple as produced by the model's
        ``getTranscript()`` and ``getWordLocations()`` methods.
    """
    # Imported here rather than at module level, as in the original code.
    import models as mo

    # Seed the RNGs first so repeated runs are comparable.
    set_seed()

    wav_path = EVENTS_FOLDER / f"test_{language}_easy.wav"
    raw_audio, _ = audioread_load(wav_path)

    resampled = transform(torch.Tensor(raw_audio)).unsqueeze(0)
    prepared = pronunciationTrainer.preprocessAudioStandalone(resampled)

    recognizer = mo.getASRModel(language, model_name=model_name)
    recognizer.processAudio(prepared)

    return recognizer.getTranscript(), recognizer.getWordLocations()
class TestNeuralASR(unittest.TestCase):
    """Regression tests for the ASR back-ends on the bundled "easy" clips.

    Every test calls ``helper_neural_asr`` for one language/model pair and
    compares the returned transcript and word locations with hard-coded
    expected values.
    """

    # Always show the complete diff when the word-location lists differ.
    maxDiff = None

    def setUp(self):
        """Set the environment variables the tests run under."""
        import os
        import platform

        if platform.system() in ("Windows", "Win32"):
            os.environ["PYTHONUTF8"] = "1"
        os.environ["IS_TESTING"] = "TRUE"

    def tearDown(self):
        """Remove the environment variables set by ``setUp``.

        ``pop`` with a default is used instead of ``del`` so the cleanup
        never raises ``KeyError`` when a variable is already gone. The
        previous condition mixed ``or``/``and`` without parentheses, so the
        presence check was skipped on Windows.
        """
        import os
        import platform

        if platform.system() in ("Windows", "Win32"):
            os.environ.pop("PYTHONUTF8", None)
        os.environ.pop("IS_TESTING", None)

    def test_neural_asr_de_whisper(self):
        """German clip with the default model and with "whisper"."""
        for model_name in [MODEL_NAME_DEFAULT, "whisper"]:
            # subTest reports which model failed and still runs the other one.
            with self.subTest(model_name=model_name):
                audio_transcript, word_locations_in_samples = helper_neural_asr("de", model_name)
                self.assertEqual(audio_transcript, ' Hallo, wie geht es dir?')
                self.assertEqual(word_locations_in_samples, [
                    {'end_ts': 5120.0, 'start_ts': 0.0, 'word': ' Hallo,'},
                    {'end_ts': 10240.0, 'start_ts': 8640.0, 'word': ' wie'},
                    {'end_ts': 13120.0, 'start_ts': 10240.0, 'word': ' geht'},
                    {'end_ts': 16640.0, 'start_ts': 13120.0, 'word': ' es'},
                    {'end_ts': 20160.0, 'start_ts': 16640.0, 'word': ' dir?'}
                ])

    def test_neural_asr_en_default(self):
        """English clip with the default model and with "whisper"."""
        for model_name in [MODEL_NAME_DEFAULT, "whisper"]:
            with self.subTest(model_name=model_name):
                audio_transcript, word_locations_in_samples = helper_neural_asr("en", model_name)
                self.assertEqual(audio_transcript, ' Hi there, how are you?')
                self.assertEqual(word_locations_in_samples, [
                    {'end_ts': 2240.0, 'start_ts': 0.0, 'word': ' Hi'},
                    {'end_ts': 4800.0, 'start_ts': 2240.0, 'word': ' there,'},
                    {'end_ts': 9280.0, 'start_ts': 7360.0, 'word': ' how'},
                    {'end_ts': 11200.0, 'start_ts': 9280.0, 'word': ' are'},
                    {'end_ts': 13760.0, 'start_ts': 11200.0, 'word': ' you?'}
                ])

    def test_neural_asr_de_faster_whisper(self):
        """German clip with the "faster_whisper" model."""
        audio_transcript, word_locations_in_samples = helper_neural_asr("de", "faster_whisper")
        self.assertEqual(audio_transcript, ' Hallo, wie geht es dir?')
        self.assertEqual(word_locations_in_samples, [
            {'end_ts': 5120.0, 'start_ts': 0.0, 'word': ' Hallo,'},
            {'end_ts': 10240.0, 'start_ts': 8640.0, 'word': ' wie'},
            {'end_ts': 13120.0, 'start_ts': 10240.0, 'word': ' geht'},
            {'end_ts': 16640.0, 'start_ts': 13120.0, 'word': ' es'},
            {'end_ts': 20160.0, 'start_ts': 16640.0, 'word': ' dir?'}
        ])

    def test_neural_asr_en_faster_whisper(self):
        """English clip with the "faster_whisper" model."""
        audio_transcript, word_locations_in_samples = helper_neural_asr("en", "faster_whisper")
        self.assertEqual(audio_transcript, ' Hi there, how are you?')
        self.assertEqual(word_locations_in_samples, [
            {'end_ts': 2240.0, 'start_ts': 0.0, 'word': ' Hi'},
            {'end_ts': 4800.0, 'start_ts': 2240.0, 'word': ' there,'},
            {'end_ts': 9280.0, 'start_ts': 7360.0, 'word': ' how'},
            {'end_ts': 11200.0, 'start_ts': 9280.0, 'word': ' are'},
            {'end_ts': 14080.0, 'start_ts': 11200.0, 'word': ' you?'}
        ])

    def test_neural_asr_de_silero(self):
        """German clip with the "silero" model."""
        audio_transcript, word_locations_in_samples = helper_neural_asr("de", "silero")
        self.assertEqual(audio_transcript, 'hallo wie geht es dir')
        # Kept from the original test: dump the raw locations to stdout.
        print("word_locations_in_samples:")
        print(word_locations_in_samples)
        self.assertEqual(word_locations_in_samples, [
            {'word': 'hallo', 'start_ts': 0.0, 'end_ts': 6773.68},
            {'word': 'wie', 'start_ts': 6773.68, 'end_ts': 10468.42},
            {'word': 'geht', 'start_ts': 10468.42, 'end_ts': 13547.37},
            {'word': 'es', 'start_ts': 13547.37, 'end_ts': 16626.32},
            {'word': 'dir', 'start_ts': 16626.32, 'end_ts': 20321.05}
        ])

    def test_neural_asr_en_silero(self):
        """English clip with the "silero" model."""
        audio_transcript, word_locations_in_samples = helper_neural_asr("en", "silero")
        # The expected transcript really is 'i there ...' (not 'hi there').
        self.assertEqual(audio_transcript, 'i there how are you')
        self.assertEqual(word_locations_in_samples, [
            {'end_ts': 1800.0, 'start_ts': 0.0, 'word': 'i'},
            {'end_ts': 5400.0, 'start_ts': 1800.0, 'word': 'there'},
            {'end_ts': 8400.0, 'start_ts': 5400.0, 'word': 'how'},
            {'end_ts': 12000.0, 'start_ts': 8400.0, 'word': 'are'},
            {'end_ts': 15000.0, 'start_ts': 12000.0, 'word': 'you'}
        ])
# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()