Jirat Jaturanpinyo
Upload voicevox_engine
edc06cb verified
raw
history blame
22.5 kB
"""TTSEngine のテスト"""
from test.utility import pydantic_to_native_type, round_floats, summarize_big_ndarray
from unittest.mock import MagicMock
import numpy as np
import pytest
from syrupy.assertion import SnapshotAssertion
from voicevox_engine.dev.core.mock import MockCoreWrapper
from voicevox_engine.metas.Metas import StyleId
from voicevox_engine.model import AudioQuery
from voicevox_engine.tts_pipeline.model import (
AccentPhrase,
FrameAudioQuery,
Mora,
Note,
Score,
)
from voicevox_engine.tts_pipeline.text_analyzer import text_to_accent_phrases
from voicevox_engine.tts_pipeline.tts_engine import (
TTSEngine,
_apply_interrogative_upspeak,
_to_flatten_phonemes,
to_flatten_moras,
)
from .test_text_analyzer import stub_unknown_features_koxx
from .tts_utils import gen_mora, sec
def test_to_flatten_phonemes() -> None:
"""Test `_to_flatten_phonemes()`."""
# Inputs
moras = [
gen_mora(" ", None, None, "sil", sec(2), 0.0),
gen_mora("ヒ", "h", sec(2), "i", sec(4), 5.0),
gen_mora(" ", None, None, "sil", sec(6), 0.0),
]
# Expects
true_phoneme_strs = ["pau", "h", "i", "pau"]
# Outputs
phonemes = _to_flatten_phonemes(moras)
phoneme_strs = list(map(lambda p: p._phoneme, phonemes))
# Test
assert true_phoneme_strs == phoneme_strs
def _gen_hello_hiho_accent_phrases() -> list[AccentPhrase]:
return [
AccentPhrase(
moras=[
gen_mora("コ", "k", 0.1, "o", 0.1, 5.0),
gen_mora("ン", None, None, "N", 0.1, 5.0),
gen_mora("ニ", "n", 0.1, "i", 0.1, 5.0),
gen_mora("チ", "ch", 0.1, "i", 0.1, 5.0),
gen_mora("ワ", "w", 0.1, "a", 0.1, 5.0),
],
accent=5,
pause_mora=gen_mora("、", None, None, "pau", 0.1, 0.0),
),
AccentPhrase(
moras=[
gen_mora("ヒ", "h", 0.1, "i", 0.1, 0.0),
gen_mora("ホ", "h", 0.1, "o", 0.1, 5.0),
gen_mora("デ", "d", 0.1, "e", 0.1, 5.0),
gen_mora("ス", "s", 0.1, "U", 0.1, 0.0),
],
accent=1,
pause_mora=None,
),
]
def _gen_hello_hiho_query() -> AudioQuery:
return AudioQuery(
accent_phrases=_gen_hello_hiho_accent_phrases(),
speedScale=2.0,
pitchScale=1.1,
intonationScale=0.9,
volumeScale=1.3,
prePhonemeLength=0.1,
postPhonemeLength=0.2,
pauseLength=0.3,
pauseLengthScale=0.8,
outputSamplingRate=12000,
outputStereo=True,
kana="コンニチワ'、ヒ'ホデ_ス",
)
def _gen_doremi_score() -> Score:
return Score(
notes=[
Note(key=None, frame_length=10, lyric=""),
Note(key=60, frame_length=12, lyric="ど"),
Note(key=62, frame_length=17, lyric="れ"),
Note(key=64, frame_length=21, lyric="み"),
Note(key=None, frame_length=5, lyric=""),
Note(key=65, frame_length=12, lyric="ふぁ"),
Note(key=67, frame_length=17, lyric="そ"),
Note(key=None, frame_length=10, lyric=""),
]
)
def test_to_flatten_moras() -> None:
flatten_moras = to_flatten_moras(_gen_hello_hiho_accent_phrases())
true_accent_phrases_hello_hiho = _gen_hello_hiho_accent_phrases()
assert (
flatten_moras
== true_accent_phrases_hello_hiho[0].moras
+ [true_accent_phrases_hello_hiho[0].pause_mora]
+ true_accent_phrases_hello_hiho[1].moras
)
def test_update_length() -> None:
core = MockCoreWrapper()
core.yukarin_s_forward = MagicMock(wraps=core.yukarin_s_forward) # type: ignore[method-assign]
_yukarin_s_mock = core.yukarin_s_forward
tts_engine = TTSEngine(core=core)
# Inputs
hello_hiho = _gen_hello_hiho_accent_phrases()
# Indirect Outputs(yukarin_sに渡される値)
tts_engine.update_length(hello_hiho, StyleId(1))
yukarin_s_args = _yukarin_s_mock.call_args[1]
list_length = yukarin_s_args["length"]
phoneme_list = yukarin_s_args["phoneme_list"]
style_id = yukarin_s_args["style_id"]
# Expects
true_list_length = 20
true_style_id = 1
true_phoneme_list_1 = [0, 23, 30, 4, 28, 21, 10, 21, 42, 7]
true_phoneme_list_2 = [0, 19, 21, 19, 30, 12, 14, 35, 6, 0]
true_phoneme_list = true_phoneme_list_1 + true_phoneme_list_2
assert list_length == true_list_length
assert list_length == len(phoneme_list)
assert style_id == true_style_id
np.testing.assert_array_equal(
phoneme_list,
np.array(true_phoneme_list, dtype=np.int64),
)
def test_update_pitch() -> None:
core = MockCoreWrapper()
core.yukarin_sa_forward = MagicMock(wraps=core.yukarin_sa_forward) # type: ignore[method-assign]
_yukarin_sa_mock = core.yukarin_sa_forward
tts_engine = TTSEngine(core=core)
# 空のリストでエラーを吐かないか
# Inputs
phrases: list = []
# Outputs
result = tts_engine.update_pitch(phrases, StyleId(1))
# Expects
true_result: list = []
# Tests
assert result == true_result
# Inputs
hello_hiho = _gen_hello_hiho_accent_phrases()
# Indirect Outputs(yukarin_saに渡される値)
tts_engine.update_pitch(hello_hiho, StyleId(1))
yukarin_sa_args = _yukarin_sa_mock.call_args[1]
list_length = yukarin_sa_args["length"]
vowel_phoneme_list = yukarin_sa_args["vowel_phoneme_list"][0]
consonant_phoneme_list = yukarin_sa_args["consonant_phoneme_list"][0]
start_accent_list = yukarin_sa_args["start_accent_list"][0]
end_accent_list = yukarin_sa_args["end_accent_list"][0]
start_accent_phrase_list = yukarin_sa_args["start_accent_phrase_list"][0]
end_accent_phrase_list = yukarin_sa_args["end_accent_phrase_list"][0]
style_id = yukarin_sa_args["style_id"]
# Expects
true_vowels = np.array([0, 30, 4, 21, 21, 7, 0, 21, 30, 14, 6, 0])
true_consonants = np.array([-1, 23, -1, 28, 10, 42, -1, 19, 19, 12, 35, -1])
true_accent_starts = np.array([0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0])
true_accent_ends = np.array([0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0])
true_phrase_starts = np.array([0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0])
true_phrase_ends = np.array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0])
# Tests
assert list_length == 12
assert list_length == len(vowel_phoneme_list)
assert list_length == len(consonant_phoneme_list)
assert list_length == len(start_accent_list)
assert list_length == len(end_accent_list)
assert list_length == len(start_accent_phrase_list)
assert list_length == len(end_accent_phrase_list)
assert style_id == 1
np.testing.assert_array_equal(vowel_phoneme_list, true_vowels)
np.testing.assert_array_equal(consonant_phoneme_list, true_consonants)
np.testing.assert_array_equal(start_accent_list, true_accent_starts)
np.testing.assert_array_equal(end_accent_list, true_accent_ends)
np.testing.assert_array_equal(start_accent_phrase_list, true_phrase_starts)
np.testing.assert_array_equal(end_accent_phrase_list, true_phrase_ends)
def test_create_accent_phrases_toward_unknown() -> None:
"""`TTSEngine.create_accent_phrases()` は unknown 音素の Phoneme 化に失敗する"""
engine = TTSEngine(MockCoreWrapper())
# NOTE: TTSEngine.create_accent_phrases() のコールで unknown feature を得ることが難しいため、疑似再現
accent_phrases = text_to_accent_phrases(
"dummy", text_to_features=stub_unknown_features_koxx
)
with pytest.raises(ValueError) as e:
accent_phrases = engine.update_length_and_pitch(accent_phrases, StyleId(0))
assert str(e.value) == "tuple.index(x): x not in tuple"
def test_mocked_update_length_output(snapshot_json: SnapshotAssertion) -> None:
"""モックされた `TTSEngine.update_length()` の出力スナップショットが一定である"""
# Inputs
tts_engine = TTSEngine(MockCoreWrapper())
hello_hiho = _gen_hello_hiho_accent_phrases()
# Outputs
result = tts_engine.update_length(hello_hiho, StyleId(1))
# Tests
assert snapshot_json == round_floats(pydantic_to_native_type(result), round_value=2)
def test_mocked_update_pitch_output(snapshot_json: SnapshotAssertion) -> None:
"""モックされた `TTSEngine.update_pitch()` の出力スナップショットが一定である"""
# Inputs
tts_engine = TTSEngine(MockCoreWrapper())
hello_hiho = _gen_hello_hiho_accent_phrases()
# Outputs
result = tts_engine.update_pitch(hello_hiho, StyleId(1))
# Tests
assert snapshot_json == round_floats(pydantic_to_native_type(result), round_value=2)
def test_mocked_update_length_and_pitch_output(
snapshot_json: SnapshotAssertion,
) -> None:
"""モックされた `TTSEngine.update_length_and_pitch()` の出力スナップショットが一定である"""
# Inputs
tts_engine = TTSEngine(MockCoreWrapper())
hello_hiho = _gen_hello_hiho_accent_phrases()
# Outputs
result = tts_engine.update_length_and_pitch(hello_hiho, StyleId(1))
# Tests
assert snapshot_json == round_floats(pydantic_to_native_type(result), round_value=2)
def test_mocked_create_accent_phrases_output(
snapshot_json: SnapshotAssertion,
) -> None:
"""モックされた `TTSEngine.create_accent_phrases()` の出力スナップショットが一定である"""
# Inputs
tts_engine = TTSEngine(MockCoreWrapper())
hello_hiho = "こんにちは、ヒホです"
# Outputs
result = tts_engine.create_accent_phrases(hello_hiho, StyleId(1))
# Tests
assert snapshot_json == round_floats(pydantic_to_native_type(result), round_value=2)
def test_mocked_create_accent_phrases_from_kana_output(
snapshot_json: SnapshotAssertion,
) -> None:
"""モックされた `TTSEngine.create_accent_phrases_from_kana()` の出力スナップショットが一定である"""
# Inputs
tts_engine = TTSEngine(MockCoreWrapper())
hello_hiho = "コンニチワ'、ヒ'ホデ_ス"
# Outputs
result = tts_engine.create_accent_phrases_from_kana(hello_hiho, StyleId(1))
# Tests
assert snapshot_json == round_floats(pydantic_to_native_type(result), round_value=2)
def test_mocked_synthesize_wave_output(snapshot_json: SnapshotAssertion) -> None:
"""モックされた `TTSEngine.synthesize_wave()` の出力スナップショットが一定である"""
# Inputs
tts_engine = TTSEngine(MockCoreWrapper())
hello_hiho = _gen_hello_hiho_query()
# Outputs
result = tts_engine.synthesize_wave(hello_hiho, StyleId(1))
# Tests
assert snapshot_json == summarize_big_ndarray(round_floats(result, round_value=2))
def test_mocked_create_sing_volume_from_phoneme_and_f0_output(
snapshot_json: SnapshotAssertion,
) -> None:
"""
モックされた `TTSEngine.create_sing_phoneme_and_f0_and_volume()` の出力スナップショットが一定である
NOTE: 入力生成の簡略化に別関数を呼び出すため、別関数が正しく動作しない場合テストが落ちる
"""
# Inputs
tts_engine = TTSEngine(MockCoreWrapper())
doremi_srore = _gen_doremi_score()
phonemes, f0s, _ = tts_engine.create_sing_phoneme_and_f0_and_volume(
doremi_srore, StyleId(1)
)
# Outputs
result = tts_engine.create_sing_volume_from_phoneme_and_f0(
doremi_srore, phonemes, f0s, StyleId(1)
)
# Tests
assert snapshot_json == round_floats(result, round_value=2)
def test_mocked_synthesize_wave_from_score_output(
snapshot_json: SnapshotAssertion,
) -> None:
"""
モックされた `TTSEngine.create_sing_phoneme_and_f0_and_volume()` と
`TTSEngine.frame_synthsize_wave()` の出力スナップショットが一定である
"""
# Inputs
tts_engine = TTSEngine(MockCoreWrapper())
doremi_srore = _gen_doremi_score()
# Outputs
result = tts_engine.create_sing_phoneme_and_f0_and_volume(doremi_srore, StyleId(1))
# Tests
assert snapshot_json(name="query") == round_floats(
pydantic_to_native_type(result), round_value=2
)
# Inputs
phonemes, f0, volume = result
doremi_query = FrameAudioQuery(
f0=f0,
volume=volume,
phonemes=phonemes,
volumeScale=1.3,
outputSamplingRate=1200,
outputStereo=False,
)
# Outputs
result_wave = tts_engine.frame_synthsize_wave(doremi_query, StyleId(1))
# Tests
assert snapshot_json(name="wave") == round_floats(
result_wave.tolist(), round_value=2
)
def koreha_arimasuka_base_expected() -> list[AccentPhrase]:
return [
AccentPhrase(
moras=[
Mora(
text="コ",
consonant="k",
consonant_length=np.float32(2.44),
vowel="o",
vowel_length=np.float32(2.88),
pitch=np.float32(4.38),
),
Mora(
text="レ",
consonant="r",
consonant_length=np.float32(3.06),
vowel="e",
vowel_length=np.float32(1.88),
pitch=np.float32(4.0),
),
Mora(
text="ワ",
consonant="w",
consonant_length=np.float32(3.62),
vowel="a",
vowel_length=np.float32(1.44),
pitch=np.float32(4.19),
),
],
accent=3,
pause_mora=None,
is_interrogative=False,
),
AccentPhrase(
moras=[
Mora(
text="ア",
consonant=None,
consonant_length=None,
vowel="a",
vowel_length=np.float32(1.44),
pitch=np.float32(1.44),
),
Mora(
text="リ",
consonant="r",
consonant_length=np.float32(3.06),
vowel="i",
vowel_length=np.float32(2.31),
pitch=np.float32(4.44),
),
Mora(
text="マ",
consonant="m",
consonant_length=np.float32(2.62),
vowel="a",
vowel_length=np.float32(1.44),
pitch=np.float32(3.12),
),
Mora(
text="ス",
consonant="s",
consonant_length=np.float32(3.19),
vowel="U",
vowel_length=np.float32(1.38),
pitch=np.float32(0.0),
),
Mora(
text="カ",
consonant="k",
consonant_length=np.float32(2.44),
vowel="a",
vowel_length=np.float32(1.44),
pitch=np.float32(2.94),
),
],
accent=3,
pause_mora=None,
is_interrogative=False,
),
]
def create_synthesis_test_base(text: str) -> list[AccentPhrase]:
tts_engine = TTSEngine(core=MockCoreWrapper())
return tts_engine.create_accent_phrases(text, StyleId(1))
def test_create_accent_phrases() -> None:
"""accent_phrasesの作成時では疑問文モーラ処理を行わない
(https://github.com/VOICEVOX/voicevox_engine/issues/272#issuecomment-1022610866)
"""
tts_engine = TTSEngine(core=MockCoreWrapper())
text = "これはありますか?"
expected = koreha_arimasuka_base_expected()
expected[-1].is_interrogative = True
actual = tts_engine.create_accent_phrases(text, StyleId(1))
assert expected == actual, f"case(text:{text})"
def test_upspeak_voiced_last_mora() -> None:
# voiced + "?" + flagON -> upspeak
# Inputs
inputs = create_synthesis_test_base(text="これはありますか?")
# Expects
expected = koreha_arimasuka_base_expected()
expected[-1].is_interrogative = True
expected[-1].moras += [
Mora(
text="ア",
consonant=None,
consonant_length=None,
vowel="a",
vowel_length=0.15,
pitch=np.float32(expected[-1].moras[-1].pitch) + 0.3,
)
]
# Outputs
outputs = _apply_interrogative_upspeak(inputs, True)
# Test
assert expected == outputs
# voiced + "?" + flagOFF -> non-upspeak
# Inputs
inputs = create_synthesis_test_base(text="これはありますか?")
# Expects
expected = koreha_arimasuka_base_expected()
expected[-1].is_interrogative = True
# Outputs
outputs = _apply_interrogative_upspeak(inputs, False)
# Test
assert expected == outputs
# voiced + "" + flagON -> non-upspeak
# Inputs
inputs = create_synthesis_test_base(text="これはありますか")
# Expects
expected = koreha_arimasuka_base_expected()
# Outputs
outputs = _apply_interrogative_upspeak(inputs, True)
# Test
assert expected == outputs
def test_upspeak_voiced_N_last_mora() -> None:
def nn_base_expected() -> list[AccentPhrase]:
return [
AccentPhrase(
moras=[
Mora(
text="ン",
consonant=None,
consonant_length=None,
vowel="N",
vowel_length=np.float32(1.25),
pitch=np.float32(1.44),
)
],
accent=1,
pause_mora=None,
is_interrogative=False,
)
]
# voiced + "" + flagON -> upspeak
# Inputs
inputs = create_synthesis_test_base(text="ん")
# Expects
expected = nn_base_expected()
# Outputs
outputs = _apply_interrogative_upspeak(inputs, True)
# Test
assert expected == outputs
# voiced + "?" + flagON -> upspeak
# Inputs
inputs = create_synthesis_test_base(text="ん?")
# Expects
expected = nn_base_expected()
expected[-1].is_interrogative = True
expected[-1].moras += [
Mora(
text="ン",
consonant=None,
consonant_length=None,
vowel="N",
vowel_length=0.15,
pitch=np.float32(expected[-1].moras[-1].pitch) + 0.3,
)
]
# Outputs
outputs = _apply_interrogative_upspeak(inputs, True)
# Test
assert expected == outputs
# voiced + "?" + flagOFF -> non-upspeak
# Inputs
inputs = create_synthesis_test_base(text="ん?")
# Expects
expected = nn_base_expected()
expected[-1].is_interrogative = True
# Outputs
outputs = _apply_interrogative_upspeak(inputs, False)
# Test
assert expected == outputs
def test_upspeak_unvoiced_last_mora() -> None:
def ltu_base_expected() -> list[AccentPhrase]:
return [
AccentPhrase(
moras=[
Mora(
text="ッ",
consonant=None,
consonant_length=None,
vowel="cl",
vowel_length=np.float32(1.69),
pitch=np.float32(0.0),
)
],
accent=1,
pause_mora=None,
is_interrogative=False,
)
]
# unvoiced + "" + flagON -> non-upspeak
# Inputs
inputs = create_synthesis_test_base(text="っ")
# Expects
expected = ltu_base_expected()
# Outputs
outputs = _apply_interrogative_upspeak(inputs, True)
# Test
assert expected == outputs
# unvoiced + "?" + flagON -> non-upspeak
# Inputs
inputs = create_synthesis_test_base(text="っ?")
# Expects
expected = ltu_base_expected()
expected[-1].is_interrogative = True
# Outputs
outputs = _apply_interrogative_upspeak(inputs, True)
# Test
assert expected == outputs
# unvoiced + "?" + flagOFF -> non-upspeak
# Inputs
inputs = create_synthesis_test_base(text="っ?")
# Expects
expected = ltu_base_expected()
expected[-1].is_interrogative = True
# Outputs
outputs = _apply_interrogative_upspeak(inputs, False)
# Test
assert expected == outputs
def test_upspeak_voiced_u_last_mora() -> None:
def su_base_expected() -> list[AccentPhrase]:
return [
AccentPhrase(
moras=[
Mora(
text="ス",
consonant="s",
consonant_length=np.float32(3.19),
vowel="u",
vowel_length=np.float32(3.5),
pitch=np.float32(5.94),
)
],
accent=1,
pause_mora=None,
is_interrogative=False,
)
]
# voiced + "" + flagON -> non-upspeak
# Inputs
inputs = create_synthesis_test_base(text="す")
# Expects
expected = su_base_expected()
# Outputs
outputs = _apply_interrogative_upspeak(inputs, True)
# Test
assert expected == outputs
# voiced + "?" + flagON -> upspeak
# Inputs
inputs = create_synthesis_test_base(text="す?")
# Expects
expected = su_base_expected()
expected[-1].is_interrogative = True
expected[-1].moras += [
Mora(
text="ウ",
consonant=None,
consonant_length=None,
vowel="u",
vowel_length=0.15,
pitch=expected[-1].moras[-1].pitch + 0.3,
)
]
# Outputs
outputs = _apply_interrogative_upspeak(inputs, True)
# Test
assert expected == outputs
# voiced + "?" + flagOFF -> non-upspeak
# Inputs
inputs = create_synthesis_test_base(text="す?")
# Expects
expected = su_base_expected()
expected[-1].is_interrogative = True
# Outputs
outputs = _apply_interrogative_upspeak(inputs, False)
# Test
assert expected == outputs