Spaces:
Running
Running
"""TTSEngine のテスト""" | |
from test.utility import pydantic_to_native_type, round_floats, summarize_big_ndarray | |
from unittest.mock import MagicMock | |
import numpy as np | |
import pytest | |
from syrupy.assertion import SnapshotAssertion | |
from voicevox_engine.dev.core.mock import MockCoreWrapper | |
from voicevox_engine.metas.Metas import StyleId | |
from voicevox_engine.model import AudioQuery | |
from voicevox_engine.tts_pipeline.model import ( | |
AccentPhrase, | |
FrameAudioQuery, | |
Mora, | |
Note, | |
Score, | |
) | |
from voicevox_engine.tts_pipeline.text_analyzer import text_to_accent_phrases | |
from voicevox_engine.tts_pipeline.tts_engine import ( | |
TTSEngine, | |
_apply_interrogative_upspeak, | |
_to_flatten_phonemes, | |
to_flatten_moras, | |
) | |
from .test_text_analyzer import stub_unknown_features_koxx | |
from .tts_utils import gen_mora, sec | |
def test_to_flatten_phonemes() -> None:
    """Check that `_to_flatten_phonemes()` flattens moras into a phoneme sequence.

    The two "sil" moras are expected to come back as "pau", and the voiced
    mora is expected to contribute its consonant followed by its vowel.
    """
    # Inputs: a voiced mora surrounded by two silent moras.
    silent_head = gen_mora(" ", None, None, "sil", sec(2), 0.0)
    voiced_hi = gen_mora("ヒ", "h", sec(2), "i", sec(4), 5.0)
    silent_tail = gen_mora(" ", None, None, "sil", sec(6), 0.0)

    # Outputs: read the symbol stored on each returned phoneme object.
    flattened = _to_flatten_phonemes([silent_head, voiced_hi, silent_tail])
    actual_symbols = [phoneme._phoneme for phoneme in flattened]

    # Test
    expected_symbols = ["pau", "h", "i", "pau"]
    assert expected_symbols == actual_symbols
def _gen_hello_hiho_accent_phrases() -> list[AccentPhrase]:
    """Build a fresh two-phrase fixture: "コンニチワ" (with a pause mora) and "ヒホデス".

    A new list of new objects is created on every call, so callers may
    mutate the result freely.
    """
    # Rows are forwarded positionally to `gen_mora`.
    # Presumably (text, consonant, consonant_length, vowel, vowel_length, pitch)
    # -- confirm against `gen_mora`'s signature.
    hello_rows = [
        ("コ", "k", 0.1, "o", 0.1, 5.0),
        ("ン", None, None, "N", 0.1, 5.0),
        ("ニ", "n", 0.1, "i", 0.1, 5.0),
        ("チ", "ch", 0.1, "i", 0.1, 5.0),
        ("ワ", "w", 0.1, "a", 0.1, 5.0),
    ]
    hiho_rows = [
        ("ヒ", "h", 0.1, "i", 0.1, 0.0),
        ("ホ", "h", 0.1, "o", 0.1, 5.0),
        ("デ", "d", 0.1, "e", 0.1, 5.0),
        ("ス", "s", 0.1, "U", 0.1, 0.0),
    ]

    # First phrase ends with a pause mora; the second has none.
    hello = AccentPhrase(
        moras=[gen_mora(*row) for row in hello_rows],
        accent=5,
        pause_mora=gen_mora("、", None, None, "pau", 0.1, 0.0),
    )
    hiho = AccentPhrase(
        moras=[gen_mora(*row) for row in hiho_rows],
        accent=1,
        pause_mora=None,
    )
    return [hello, hiho]
def _gen_hello_hiho_query() -> AudioQuery:
    """Build an `AudioQuery` fixture around the hello/hiho accent phrases.

    All scale/length/output options are set to fixed, explicitly chosen
    values so that synthesis snapshots stay reproducible.
    """
    phrases = _gen_hello_hiho_accent_phrases()
    query = AudioQuery(
        accent_phrases=phrases,
        # Prosody scaling
        speedScale=2.0,
        pitchScale=1.1,
        intonationScale=0.9,
        volumeScale=1.3,
        # Silence / pause lengths
        prePhonemeLength=0.1,
        postPhonemeLength=0.2,
        pauseLength=0.3,
        pauseLengthScale=0.8,
        # Output format
        outputSamplingRate=12000,
        outputStereo=True,
        kana="コンニチワ'、ヒ'ホデ_ス",
    )
    return query
def _gen_doremi_score() -> Score:
    """Build a `Score` fixture with the lyrics "ど れ み" and "ふぁ そ".

    Entries with `key=None` and an empty lyric presumably represent rests
    (confirm against the `Note` model).
    """
    # (key, frame_length, lyric) for each note, in order.
    note_rows = [
        (None, 10, ""),
        (60, 12, "ど"),
        (62, 17, "れ"),
        (64, 21, "み"),
        (None, 5, ""),
        (65, 12, "ふぁ"),
        (67, 17, "そ"),
        (None, 10, ""),
    ]
    return Score(
        notes=[
            Note(key=key, frame_length=frame_length, lyric=lyric)
            for key, frame_length, lyric in note_rows
        ]
    )
def test_to_flatten_moras() -> None:
    """Check that `to_flatten_moras()` concatenates moras and pause moras in order."""
    # Outputs
    actual = to_flatten_moras(_gen_hello_hiho_accent_phrases())

    # Expects: moras of phrase 1, its pause mora, then moras of phrase 2.
    # A second, independent fixture is used so the expectation does not
    # share objects with the input.
    first_phrase, second_phrase = _gen_hello_hiho_accent_phrases()
    expected = [
        *first_phrase.moras,
        first_phrase.pause_mora,
        *second_phrase.moras,
    ]

    # Test
    assert actual == expected
def test_update_length() -> None:
    """Verify the keyword arguments `TTSEngine.update_length()` passes to `yukarin_s_forward`."""
    core = MockCoreWrapper()
    # Wrap the mock core's method so the call is recorded but still executed.
    spy = MagicMock(wraps=core.yukarin_s_forward)
    core.yukarin_s_forward = spy  # type: ignore[method-assign]
    engine = TTSEngine(core=core)

    # Indirect outputs (the values handed to yukarin_s)
    engine.update_length(_gen_hello_hiho_accent_phrases(), StyleId(1))
    passed = spy.call_args.kwargs

    # Expects
    expected_phoneme_ids = np.array(
        [0, 23, 30, 4, 28, 21, 10, 21, 42, 7, 0, 19, 21, 19, 30, 12, 14, 35, 6, 0],
        dtype=np.int64,
    )

    # Tests
    assert passed["length"] == 20
    assert passed["length"] == len(passed["phoneme_list"])
    assert passed["style_id"] == 1
    np.testing.assert_array_equal(passed["phoneme_list"], expected_phoneme_ids)
def test_update_pitch() -> None:
    """Verify `TTSEngine.update_pitch()` on empty input and the arguments it passes to `yukarin_sa_forward`."""
    core = MockCoreWrapper()
    # Wrap the mock core's method so the call is recorded but still executed.
    spy = MagicMock(wraps=core.yukarin_sa_forward)
    core.yukarin_sa_forward = spy  # type: ignore[method-assign]
    engine = TTSEngine(core=core)

    # An empty phrase list must not raise and must come back empty.
    no_phrases: list = []
    assert engine.update_pitch(no_phrases, StyleId(1)) == []

    # Indirect outputs (the values handed to yukarin_sa)
    engine.update_pitch(_gen_hello_hiho_accent_phrases(), StyleId(1))
    passed = spy.call_args.kwargs

    # Expects, keyed by the keyword-argument name they are compared against.
    expected_by_key = {
        "vowel_phoneme_list": np.array([0, 30, 4, 21, 21, 7, 0, 21, 30, 14, 6, 0]),
        "consonant_phoneme_list": np.array(
            [-1, 23, -1, 28, 10, 42, -1, 19, 19, 12, 35, -1]
        ),
        "start_accent_list": np.array([0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0]),
        "end_accent_list": np.array([0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0]),
        "start_accent_phrase_list": np.array([0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]),
        "end_accent_phrase_list": np.array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0]),
    }

    # Tests
    # Each array argument is indexed with [0] before being checked
    # (presumably a leading batch axis of size 1 -- confirm against the core API).
    assert passed["length"] == 12
    for key in expected_by_key:
        assert passed["length"] == len(passed[key][0])
    assert passed["style_id"] == 1
    for key, expected in expected_by_key.items():
        np.testing.assert_array_equal(passed[key][0], expected)
def test_create_accent_phrases_toward_unknown() -> None:
    """`TTSEngine.create_accent_phrases()` fails to turn an unknown phoneme into a Phoneme.

    The failure is reproduced through `update_length_and_pitch()` and is
    expected to surface as a `ValueError`.
    """
    engine = TTSEngine(MockCoreWrapper())
    # NOTE: It is hard to obtain unknown features through a real call to
    #       TTSEngine.create_accent_phrases(), so the situation is simulated
    #       with a stubbed feature extractor.
    unknown_phrases = text_to_accent_phrases(
        "dummy", text_to_features=stub_unknown_features_koxx
    )

    with pytest.raises(ValueError) as raised:
        engine.update_length_and_pitch(unknown_phrases, StyleId(0))

    # Checked after the `with` block so the assertion actually runs.
    assert str(raised.value) == "tuple.index(x): x not in tuple"
def test_mocked_update_length_output(snapshot_json: SnapshotAssertion) -> None:
    """The output of the mocked `TTSEngine.update_length()` matches its stored snapshot."""
    engine = TTSEngine(MockCoreWrapper())

    # Outputs
    updated = engine.update_length(_gen_hello_hiho_accent_phrases(), StyleId(1))

    # Tests: convert to native types and round floats to 2 places before comparing.
    normalized = round_floats(pydantic_to_native_type(updated), round_value=2)
    assert snapshot_json == normalized
def test_mocked_update_pitch_output(snapshot_json: SnapshotAssertion) -> None:
    """The output of the mocked `TTSEngine.update_pitch()` matches its stored snapshot."""
    engine = TTSEngine(MockCoreWrapper())

    # Outputs
    updated = engine.update_pitch(_gen_hello_hiho_accent_phrases(), StyleId(1))

    # Tests: convert to native types and round floats to 2 places before comparing.
    normalized = round_floats(pydantic_to_native_type(updated), round_value=2)
    assert snapshot_json == normalized
def test_mocked_update_length_and_pitch_output(
    snapshot_json: SnapshotAssertion,
) -> None:
    """The output of the mocked `TTSEngine.update_length_and_pitch()` matches its stored snapshot."""
    engine = TTSEngine(MockCoreWrapper())

    # Outputs
    updated = engine.update_length_and_pitch(
        _gen_hello_hiho_accent_phrases(), StyleId(1)
    )

    # Tests: convert to native types and round floats to 2 places before comparing.
    normalized = round_floats(pydantic_to_native_type(updated), round_value=2)
    assert snapshot_json == normalized
def test_mocked_create_accent_phrases_output(
    snapshot_json: SnapshotAssertion,
) -> None:
    """The output of the mocked `TTSEngine.create_accent_phrases()` matches its stored snapshot."""
    engine = TTSEngine(MockCoreWrapper())
    text = "こんにちは、ヒホです"

    # Outputs
    phrases = engine.create_accent_phrases(text, StyleId(1))

    # Tests: convert to native types and round floats to 2 places before comparing.
    normalized = round_floats(pydantic_to_native_type(phrases), round_value=2)
    assert snapshot_json == normalized
def test_mocked_create_accent_phrases_from_kana_output(
    snapshot_json: SnapshotAssertion,
) -> None:
    """The output of the mocked `TTSEngine.create_accent_phrases_from_kana()` matches its stored snapshot."""
    engine = TTSEngine(MockCoreWrapper())
    kana = "コンニチワ'、ヒ'ホデ_ス"

    # Outputs
    phrases = engine.create_accent_phrases_from_kana(kana, StyleId(1))

    # Tests: convert to native types and round floats to 2 places before comparing.
    normalized = round_floats(pydantic_to_native_type(phrases), round_value=2)
    assert snapshot_json == normalized
def test_mocked_synthesize_wave_output(snapshot_json: SnapshotAssertion) -> None:
    """The output of the mocked `TTSEngine.synthesize_wave()` matches its stored snapshot."""
    engine = TTSEngine(MockCoreWrapper())

    # Outputs
    wave = engine.synthesize_wave(_gen_hello_hiho_query(), StyleId(1))

    # Tests: round floats to 2 places, then summarize the array before comparing.
    rounded = round_floats(wave, round_value=2)
    assert snapshot_json == summarize_big_ndarray(rounded)
def test_mocked_create_sing_volume_from_phoneme_and_f0_output(
    snapshot_json: SnapshotAssertion,
) -> None:
    """
    The output of the mocked `TTSEngine.create_sing_volume_from_phoneme_and_f0()`
    matches its stored snapshot.

    NOTE: To keep input generation simple, the phonemes and f0 values are
          produced by `TTSEngine.create_sing_phoneme_and_f0_and_volume()`,
          so this test also fails if that method misbehaves.
    """
    # Inputs
    tts_engine = TTSEngine(MockCoreWrapper())
    doremi_score = _gen_doremi_score()
    # The third returned value is not needed here and is discarded.
    phonemes, f0s, _ = tts_engine.create_sing_phoneme_and_f0_and_volume(
        doremi_score, StyleId(1)
    )

    # Outputs
    result = tts_engine.create_sing_volume_from_phoneme_and_f0(
        doremi_score, phonemes, f0s, StyleId(1)
    )

    # Tests: floats are rounded to 2 places before comparing.
    assert snapshot_json == round_floats(result, round_value=2)
def test_mocked_synthesize_wave_from_score_output(
    snapshot_json: SnapshotAssertion,
) -> None:
    """
    The outputs of the mocked `TTSEngine.create_sing_phoneme_and_f0_and_volume()`
    and `TTSEngine.frame_synthsize_wave()` match their stored snapshots.
    """
    engine = TTSEngine(MockCoreWrapper())
    score = _gen_doremi_score()

    # Step 1: score -> (phonemes, f0, volume), checked against the "query" snapshot.
    sing_features = engine.create_sing_phoneme_and_f0_and_volume(score, StyleId(1))
    normalized_features = round_floats(
        pydantic_to_native_type(sing_features), round_value=2
    )
    assert snapshot_json(name="query") == normalized_features

    # Step 2: feed those values into a frame query and synthesize it,
    # checked against the "wave" snapshot.
    phonemes, f0, volume = sing_features
    frame_query = FrameAudioQuery(
        f0=f0,
        volume=volume,
        phonemes=phonemes,
        volumeScale=1.3,
        outputSamplingRate=1200,
        outputStereo=False,
    )
    wave = engine.frame_synthsize_wave(frame_query, StyleId(1))
    normalized_wave = round_floats(wave.tolist(), round_value=2)
    assert snapshot_json(name="wave") == normalized_wave
def koreha_arimasuka_base_expected() -> list[AccentPhrase]:
    """Return the baseline expected accent phrases for "コレワ" + "アリマスカ".

    Both phrases have `accent=3`, no pause mora and `is_interrogative=False`;
    tests adjust the returned (freshly built) objects for their scenario.
    Numeric fields are wrapped in `np.float32`, and a missing consonant
    length stays `None`.
    """

    def build_mora(
        text: str,
        consonant: str | None,
        consonant_length: float | None,
        vowel: str,
        vowel_length: float,
        pitch: float,
    ) -> Mora:
        """Create one `Mora`, converting the numeric fields to `np.float32`."""
        return Mora(
            text=text,
            consonant=consonant,
            consonant_length=(
                None if consonant_length is None else np.float32(consonant_length)
            ),
            vowel=vowel,
            vowel_length=np.float32(vowel_length),
            pitch=np.float32(pitch),
        )

    # (text, consonant, consonant_length, vowel, vowel_length, pitch)
    koreha_rows = [
        ("コ", "k", 2.44, "o", 2.88, 4.38),
        ("レ", "r", 3.06, "e", 1.88, 4.0),
        ("ワ", "w", 3.62, "a", 1.44, 4.19),
    ]
    arimasuka_rows = [
        ("ア", None, None, "a", 1.44, 1.44),
        ("リ", "r", 3.06, "i", 2.31, 4.44),
        ("マ", "m", 2.62, "a", 1.44, 3.12),
        ("ス", "s", 3.19, "U", 1.38, 0.0),
        ("カ", "k", 2.44, "a", 1.44, 2.94),
    ]

    return [
        AccentPhrase(
            moras=[build_mora(*row) for row in rows],
            accent=3,
            pause_mora=None,
            is_interrogative=False,
        )
        for rows in (koreha_rows, arimasuka_rows)
    ]
def create_synthesis_test_base(text: str) -> list[AccentPhrase]:
    """Create accent phrases for `text` with a fresh mock-core `TTSEngine` and style ID 1."""
    return TTSEngine(core=MockCoreWrapper()).create_accent_phrases(text, StyleId(1))
def test_create_accent_phrases() -> None:
    """Interrogative mora processing is not performed when accent phrases are created.

    Only the `is_interrogative` flag of the last phrase is expected to be set.
    (https://github.com/VOICEVOX/voicevox_engine/issues/272#issuecomment-1022610866)
    """
    text = "これはありますか?"
    engine = TTSEngine(core=MockCoreWrapper())

    # Expects: the baseline phrases, with only the last phrase flagged.
    expected = koreha_arimasuka_base_expected()
    expected[-1].is_interrogative = True

    # Outputs
    actual = engine.create_accent_phrases(text, StyleId(1))

    # Test
    assert expected == actual, f"case(text:{text})"
def test_upspeak_voiced_last_mora() -> None:
    """Upspeak handling when the last mora is voiced ("カ").

    Three scenarios are checked in sequence:
    "?" with the flag on, "?" with the flag off, and no "?" with the flag on.
    """
    # --- voiced + "?" + flag ON -> upspeak ---
    phrases = create_synthesis_test_base(text="これはありますか?")
    expected = koreha_arimasuka_base_expected()
    expected[-1].is_interrogative = True
    # The appended mora is based on the pitch of the current last mora, so
    # it is built before the list is extended.
    upspeak_mora = Mora(
        text="ア",
        consonant=None,
        consonant_length=None,
        vowel="a",
        vowel_length=0.15,
        pitch=np.float32(expected[-1].moras[-1].pitch) + 0.3,
    )
    expected[-1].moras += [upspeak_mora]
    actual = _apply_interrogative_upspeak(phrases, True)
    assert expected == actual

    # --- voiced + "?" + flag OFF -> non-upspeak ---
    phrases = create_synthesis_test_base(text="これはありますか?")
    expected = koreha_arimasuka_base_expected()
    expected[-1].is_interrogative = True
    actual = _apply_interrogative_upspeak(phrases, False)
    assert expected == actual

    # --- voiced + "" + flag ON -> non-upspeak ---
    phrases = create_synthesis_test_base(text="これはありますか")
    expected = koreha_arimasuka_base_expected()
    actual = _apply_interrogative_upspeak(phrases, True)
    assert expected == actual
def test_upspeak_voiced_N_last_mora() -> None:
    """Upspeak handling when the last mora is the voiced "ン" (vowel "N").

    Three scenarios are checked in sequence:
    no "?" with the flag on, "?" with the flag on, and "?" with the flag off.
    """

    def nn_base_expected() -> list[AccentPhrase]:
        """Return the baseline expected accent phrase for the text "ん"."""
        return [
            AccentPhrase(
                moras=[
                    Mora(
                        text="ン",
                        consonant=None,
                        consonant_length=None,
                        vowel="N",
                        vowel_length=np.float32(1.25),
                        pitch=np.float32(1.44),
                    )
                ],
                accent=1,
                pause_mora=None,
                is_interrogative=False,
            )
        ]

    # voiced + "" + flagON -> non-upspeak
    # (no "?" in the text, so the baseline is expected unchanged)
    # Inputs
    inputs = create_synthesis_test_base(text="ん")
    # Expects
    expected = nn_base_expected()
    # Outputs
    outputs = _apply_interrogative_upspeak(inputs, True)
    # Test
    assert expected == outputs

    # voiced + "?" + flagON -> upspeak
    # Inputs
    inputs = create_synthesis_test_base(text="ん?")
    # Expects: one extra "ン" mora whose pitch is the last mora's pitch + 0.3.
    expected = nn_base_expected()
    expected[-1].is_interrogative = True
    expected[-1].moras += [
        Mora(
            text="ン",
            consonant=None,
            consonant_length=None,
            vowel="N",
            vowel_length=0.15,
            pitch=np.float32(expected[-1].moras[-1].pitch) + 0.3,
        )
    ]
    # Outputs
    outputs = _apply_interrogative_upspeak(inputs, True)
    # Test
    assert expected == outputs

    # voiced + "?" + flagOFF -> non-upspeak
    # Inputs
    inputs = create_synthesis_test_base(text="ん?")
    # Expects
    expected = nn_base_expected()
    expected[-1].is_interrogative = True
    # Outputs
    outputs = _apply_interrogative_upspeak(inputs, False)
    # Test
    assert expected == outputs
def test_upspeak_unvoiced_last_mora() -> None:
    """Upspeak handling when the last mora is the unvoiced "ッ" (vowel "cl").

    No scenario is expected to append an upspeak mora; only the
    `is_interrogative` flag differs between them.
    """

    def ltu_base_expected() -> list[AccentPhrase]:
        """Return the baseline expected accent phrase for the text "っ"."""
        sokuon = Mora(
            text="ッ",
            consonant=None,
            consonant_length=None,
            vowel="cl",
            vowel_length=np.float32(1.69),
            pitch=np.float32(0.0),
        )
        return [
            AccentPhrase(
                moras=[sokuon],
                accent=1,
                pause_mora=None,
                is_interrogative=False,
            )
        ]

    # --- unvoiced + "" + flag ON -> non-upspeak ---
    phrases = create_synthesis_test_base(text="っ")
    expected = ltu_base_expected()
    actual = _apply_interrogative_upspeak(phrases, True)
    assert expected == actual

    # --- unvoiced + "?" + flag ON -> non-upspeak ---
    phrases = create_synthesis_test_base(text="っ?")
    expected = ltu_base_expected()
    expected[-1].is_interrogative = True
    actual = _apply_interrogative_upspeak(phrases, True)
    assert expected == actual

    # --- unvoiced + "?" + flag OFF -> non-upspeak ---
    phrases = create_synthesis_test_base(text="っ?")
    expected = ltu_base_expected()
    expected[-1].is_interrogative = True
    actual = _apply_interrogative_upspeak(phrases, False)
    assert expected == actual
def test_upspeak_voiced_u_last_mora() -> None:
    """Upspeak handling when the last mora is "ス" with a voiced "u" vowel.

    Three scenarios are checked in sequence:
    no "?" with the flag on, "?" with the flag on, and "?" with the flag off.
    """

    def su_base_expected() -> list[AccentPhrase]:
        """Return the baseline expected accent phrase for the text "す"."""
        su = Mora(
            text="ス",
            consonant="s",
            consonant_length=np.float32(3.19),
            vowel="u",
            vowel_length=np.float32(3.5),
            pitch=np.float32(5.94),
        )
        return [
            AccentPhrase(
                moras=[su],
                accent=1,
                pause_mora=None,
                is_interrogative=False,
            )
        ]

    # --- voiced + "" + flag ON -> non-upspeak ---
    phrases = create_synthesis_test_base(text="す")
    expected = su_base_expected()
    actual = _apply_interrogative_upspeak(phrases, True)
    assert expected == actual

    # --- voiced + "?" + flag ON -> upspeak ---
    phrases = create_synthesis_test_base(text="す?")
    expected = su_base_expected()
    expected[-1].is_interrogative = True
    # The appended mora is based on the pitch of the current last mora, so
    # it is built before the list is extended. The pitch is deliberately
    # computed without an explicit np.float32 cast, as in the original test.
    upspeak_mora = Mora(
        text="ウ",
        consonant=None,
        consonant_length=None,
        vowel="u",
        vowel_length=0.15,
        pitch=expected[-1].moras[-1].pitch + 0.3,
    )
    expected[-1].moras += [upspeak_mora]
    actual = _apply_interrogative_upspeak(phrases, True)
    assert expected == actual

    # --- voiced + "?" + flag OFF -> non-upspeak ---
    phrases = create_synthesis_test_base(text="す?")
    expected = su_base_expected()
    expected[-1].is_interrogative = True
    actual = _apply_interrogative_upspeak(phrases, False)
    assert expected == actual