VOICEVOX-Engine

Running

File size: 14,674 Bytes

edc06cb

from voicevox_engine.tts_pipeline.model import AccentPhrase, Mora
from voicevox_engine.tts_pipeline.text_analyzer import (
    AccentPhraseLabel,
    BreathGroupLabel,
    Label,
    MoraLabel,
    UtteranceLabel,
    mora_to_text,
    text_to_accent_phrases,
)


def contexts_to_feature(contexts: dict[str, str]) -> str:
    """ラベルの contexts を feature へ変換する"""
    return (
        "{p1}^{p2}-{p3}+{p4}={p5}"
        "/A:{a1}+{a2}+{a3}"
        "/B:{b1}-{b2}_{b3}"
        "/C:{c1}_{c2}+{c3}"
        "/D:{d1}+{d2}_{d3}"
        "/E:{e1}_{e2}!{e3}_{e4}-{e5}"
        "/F:{f1}_{f2}#{f3}_{f4}@{f5}_{f6}|{f7}_{f8}"
        "/G:{g1}_{g2}%{g3}_{g4}_{g5}"
        "/H:{h1}_{h2}"
        "/I:{i1}-{i2}@{i3}+{i4}&{i5}-{i6}|{i7}+{i8}"
        "/J:{j1}_{j2}"
        "/K:{k1}+{k2}-{k3}"
    ).format(**contexts)


# OpenJTalk コンテナクラス
OjtContainer = MoraLabel | AccentPhraseLabel | BreathGroupLabel | UtteranceLabel


def features(ojt_container: OjtContainer) -> list[str]:
    """コンテナインスタンスに直接的・間接的に含まれる全ての feature を返す"""
    return [contexts_to_feature(p.contexts) for p in ojt_container.labels]


# pyopenjtalk.extract_fullcontext("こんにちは、ヒホです。")の結果
# 出来る限りテスト内で他のライブラリに依存しないため、
# またテスト内容を透明化するために、テストケースを生成している
test_case_hello_hiho = [
    # sil (無音)
    "xx^xx-sil+k=o/A:xx+xx+xx/B:xx-xx_xx/C:xx_xx+xx/D:09+xx_xx/E:xx_xx!xx_xx-xx"
    + "/F:xx_xx#xx_xx@xx_xx|xx_xx/G:5_5%0_xx_xx/H:xx_xx/I:xx-xx"
    + "@xx+xx&xx-xx|xx+xx/J:1_5/K:2+2-9",
    # k
    "xx^sil-k+o=N/A:-4+1+5/B:xx-xx_xx/C:09_xx+xx/D:09+xx_xx/E:xx_xx!xx_xx-xx"
    + "/F:5_5#0_xx@1_1|1_5/G:4_1%0_xx_0/H:xx_xx/I:1-5"
    + "@1+2&1-2|1+9/J:1_4/K:2+2-9",
    # o
    "sil^k-o+N=n/A:-4+1+5/B:xx-xx_xx/C:09_xx+xx/D:09+xx_xx/E:xx_xx!xx_xx-xx"
    + "/F:5_5#0_xx@1_1|1_5/G:4_1%0_xx_0/H:xx_xx/I:1-5"
    + "@1+2&1-2|1+9/J:1_4/K:2+2-9",
    # N (ん)
    "k^o-N+n=i/A:-3+2+4/B:xx-xx_xx/C:09_xx+xx/D:09+xx_xx/E:xx_xx!xx_xx-xx"
    + "/F:5_5#0_xx@1_1|1_5/G:4_1%0_xx_0/H:xx_xx/I:1-5"
    + "@1+2&1-2|1+9/J:1_4/K:2+2-9",
    # n
    "o^N-n+i=ch/A:-2+3+3/B:xx-xx_xx/C:09_xx+xx/D:09+xx_xx/E:xx_xx!xx_xx-xx"
    + "/F:5_5#0_xx@1_1|1_5/G:4_1%0_xx_0/H:xx_xx/I:1-5"
    + "@1+2&1-2|1+9/J:1_4/K:2+2-9",
    # i
    "N^n-i+ch=i/A:-2+3+3/B:xx-xx_xx/C:09_xx+xx/D:09+xx_xx/E:xx_xx!xx_xx-xx"
    + "/F:5_5#0_xx@1_1|1_5/G:4_1%0_xx_0/H:xx_xx/I:1-5"
    + "@1+2&1-2|1+9/J:1_4/K:2+2-9",
    # ch
    "n^i-ch+i=w/A:-1+4+2/B:xx-xx_xx/C:09_xx+xx/D:09+xx_xx/E:xx_xx!xx_xx-xx"
    + "/F:5_5#0_xx@1_1|1_5/G:4_1%0_xx_0/H:xx_xx/I:1-5"
    + "@1+2&1-2|1+9/J:1_4/K:2+2-9",
    # i
    "i^ch-i+w=a/A:-1+4+2/B:xx-xx_xx/C:09_xx+xx/D:09+xx_xx/E:xx_xx!xx_xx-xx"
    + "/F:5_5#0_xx@1_1|1_5/G:4_1%0_xx_0/H:xx_xx/I:1-5"
    + "@1+2&1-2|1+9/J:1_4/K:2+2-9",
    # w
    "ch^i-w+a=pau/A:0+5+1/B:xx-xx_xx/C:09_xx+xx/D:09+xx_xx/E:xx_xx!xx_xx-xx"
    + "/F:5_5#0_xx@1_1|1_5/G:4_1%0_xx_0/H:xx_xx/I:1-5"
    + "@1+2&1-2|1+9/J:1_4/K:2+2-9",
    # a
    "i^w-a+pau=h/A:0+5+1/B:xx-xx_xx/C:09_xx+xx/D:09+xx_xx/E:xx_xx!xx_xx-xx"
    + "/F:5_5#0_xx@1_1|1_5/G:4_1%0_xx_0/H:xx_xx/I:1-5"
    + "@1+2&1-2|1+9/J:1_4/K:2+2-9",
    # pau (読点)
    "w^a-pau+h=i/A:xx+xx+xx/B:09-xx_xx/C:xx_xx+xx/D:09+xx_xx/E:5_5!0_xx-xx"
    + "/F:xx_xx#xx_xx@xx_xx|xx_xx/G:4_1%0_xx_xx/H:1_5/I:xx-xx"
    + "@xx+xx&xx-xx|xx+xx/J:1_4/K:2+2-9",
    # h
    "a^pau-h+i=h/A:0+1+4/B:09-xx_xx/C:09_xx+xx/D:22+xx_xx/E:5_5!0_xx-0"
    + "/F:4_1#0_xx@1_1|1_4/G:xx_xx%xx_xx_xx/H:1_5/I:1-4"
    + "@2+1&2-1|6+4/J:xx_xx/K:2+2-9",
    # i
    "pau^h-i+h=o/A:0+1+4/B:09-xx_xx/C:09_xx+xx/D:22+xx_xx/E:5_5!0_xx-0"
    + "/F:4_1#0_xx@1_1|1_4/G:xx_xx%xx_xx_xx/H:1_5/I:1-4"
    + "@2+1&2-1|6+4/J:xx_xx/K:2+2-9",
    # h
    "h^i-h+o=d/A:1+2+3/B:09-xx_xx/C:22_xx+xx/D:10+7_2/E:5_5!0_xx-0"
    + "/F:4_1#0_xx@1_1|1_4/G:xx_xx%xx_xx_xx/H:1_5/I:1-4"
    + "@2+1&2-1|6+4/J:xx_xx/K:2+2-9",
    # o
    "i^h-o+d=e/A:1+2+3/B:09-xx_xx/C:22_xx+xx/D:10+7_2/E:5_5!0_xx-0"
    + "/F:4_1#0_xx@1_1|1_4/G:xx_xx%xx_xx_xx/H:1_5/I:1-4"
    + "@2+1&2-1|6+4/J:xx_xx/K:2+2-9",
    # d
    "h^o-d+e=s/A:2+3+2/B:22-xx_xx/C:10_7+2/D:xx+xx_xx/E:5_5!0_xx-0"
    + "/F:4_1#0_xx@1_1|1_4/G:xx_xx%xx_xx_xx/H:1_5/I:1-4"
    + "@2+1&2-1|6+4/J:xx_xx/K:2+2-9",
    # e
    "o^d-e+s=U/A:2+3+2/B:22-xx_xx/C:10_7+2/D:xx+xx_xx/E:5_5!0_xx-0"
    + "/F:4_1#0_xx@1_1|1_4/G:xx_xx%xx_xx_xx/H:1_5/I:1-4"
    + "@2+1&2-1|6+4/J:xx_xx/K:2+2-9",
    # s
    "d^e-s+U=sil/A:3+4+1/B:22-xx_xx/C:10_7+2/D:xx+xx_xx/E:5_5!0_xx-0"
    + "/F:4_1#0_xx@1_1|1_4/G:xx_xx%xx_xx_xx/H:1_5/I:1-4"
    + "@2+1&2-1|6+4/J:xx_xx/K:2+2-9",
    # U (無声母音)
    "e^s-U+sil=xx/A:3+4+1/B:22-xx_xx/C:10_7+2/D:xx+xx_xx/E:5_5!0_xx-0"
    + "/F:4_1#0_xx@1_1|1_4/G:xx_xx%xx_xx_xx/H:1_5/I:1-4"
    + "@2+1&2-1|6+4/J:xx_xx/K:2+2-9",
    # sil (無音)
    "s^U-sil+xx=xx/A:xx+xx+xx/B:10-7_2/C:xx_xx+xx/D:xx+xx_xx/E:4_1!0_xx-xx"
    + "/F:xx_xx#xx_xx@xx_xx|xx_xx/G:xx_xx%xx_xx_xx/H:1_4/I:xx-xx"
    + "@xx+xx&xx-xx|xx+xx/J:xx_xx/K:2+2-9",
]
labels_hello_hiho = [Label.from_feature(feature) for feature in test_case_hello_hiho]


def jointed_phonemes(ojt_container: OjtContainer) -> str:
    """コンテナインスタンスに直接的・間接的に含まれる全ラベルの音素文字を結合してを返す"""
    return "".join([label.phoneme for label in ojt_container.labels])


def space_jointed_phonemes(ojt_container: OjtContainer) -> str:
    """コンテナインスタンスに直接的・間接的に含まれる全ラベルの音素文字を ` ` 挟みながら結合してを返す"""
    return " ".join([label.phoneme for label in ojt_container.labels])


def test_label_phoneme() -> None:
    """Label に含まれる音素をテスト"""
    assert (
        " ".join([label.phoneme for label in labels_hello_hiho])
        == "sil k o N n i ch i w a pau h i h o d e s U sil"
    )


def test_label_is_pause() -> None:
    """Label のポーズ判定をテスト"""
    assert [label.is_pause() for label in labels_hello_hiho] == [
        True,  # sil
        False,  # k
        False,  # o
        False,  # N
        False,  # n
        False,  # i
        False,  # ch
        False,  # i
        False,  # w
        False,  # a
        True,  # pau
        False,  # h
        False,  # i
        False,  # h
        False,  # o
        False,  # d
        False,  # e
        False,  # s
        False,  # u
        True,  # sil
    ]


def test_label_feature() -> None:
    """Label に含まれる features をテスト"""
    assert [
        contexts_to_feature(label.contexts) for label in labels_hello_hiho
    ] == test_case_hello_hiho


# contexts["a2"] == "1" ko
mora_hello_1 = MoraLabel(consonant=labels_hello_hiho[1], vowel=labels_hello_hiho[2])
# contexts["a2"] == "2" N
mora_hello_2 = MoraLabel(consonant=None, vowel=labels_hello_hiho[3])
# contexts["a2"] == "3" ni
mora_hello_3 = MoraLabel(consonant=labels_hello_hiho[4], vowel=labels_hello_hiho[5])
# contexts["a2"] == "4" chi
mora_hello_4 = MoraLabel(consonant=labels_hello_hiho[6], vowel=labels_hello_hiho[7])
# contexts["a2"] == "5" wa
mora_hello_5 = MoraLabel(consonant=labels_hello_hiho[8], vowel=labels_hello_hiho[9])
# contexts["a2"] == "1" hi
mora_hiho_1 = MoraLabel(consonant=labels_hello_hiho[11], vowel=labels_hello_hiho[12])
# contexts["a2"] == "2" ho
mora_hiho_2 = MoraLabel(consonant=labels_hello_hiho[13], vowel=labels_hello_hiho[14])
# contexts["a2"] == "3" de
mora_hiho_3 = MoraLabel(consonant=labels_hello_hiho[15], vowel=labels_hello_hiho[16])
# contexts["a2"] == "1" sU
mora_hiho_4 = MoraLabel(consonant=labels_hello_hiho[17], vowel=labels_hello_hiho[18])


def test_mora_label_phonemes() -> None:
    """MoraLabel に含まれる音素系列をテスト"""
    assert jointed_phonemes(mora_hello_1) == "ko"
    assert jointed_phonemes(mora_hello_2) == "N"
    assert jointed_phonemes(mora_hello_3) == "ni"
    assert jointed_phonemes(mora_hello_4) == "chi"
    assert jointed_phonemes(mora_hello_5) == "wa"
    assert jointed_phonemes(mora_hiho_1) == "hi"
    assert jointed_phonemes(mora_hiho_2) == "ho"
    assert jointed_phonemes(mora_hiho_3) == "de"
    assert jointed_phonemes(mora_hiho_4) == "sU"


def test_mora_label_features() -> None:
    """MoraLabel に含まれる features をテスト"""
    expects = test_case_hello_hiho
    assert features(mora_hello_1) == expects[1:3]
    assert features(mora_hello_2) == expects[3:4]
    assert features(mora_hello_3) == expects[4:6]
    assert features(mora_hello_4) == expects[6:8]
    assert features(mora_hello_5) == expects[8:10]
    assert features(mora_hiho_1) == expects[11:13]
    assert features(mora_hiho_2) == expects[13:15]
    assert features(mora_hiho_3) == expects[15:17]
    assert features(mora_hiho_4) == expects[17:19]


# TODO: ValueErrorを吐く作為的ではない自然な例の模索
# 存在しないなら放置でよい
accent_phrase_hello = AccentPhraseLabel.from_labels(labels_hello_hiho[1:10])
accent_phrase_hiho = AccentPhraseLabel.from_labels(labels_hello_hiho[11:19])


def test_accent_phrase_accent() -> None:
    """AccentPhraseLabel に含まれるアクセント位置をテスト"""
    assert accent_phrase_hello.accent == 5
    assert accent_phrase_hiho.accent == 1


def test_accent_phrase_phonemes() -> None:
    """AccentPhraseLabel に含まれる音素系列をテスト"""
    outputs_hello = space_jointed_phonemes(accent_phrase_hello)
    outputs_hiho = space_jointed_phonemes(accent_phrase_hiho)
    assert outputs_hello == "k o N n i ch i w a"
    assert outputs_hiho == "h i h o d e s U"


def test_accent_phrase_features() -> None:
    """AccentPhraseLabel に含まれる features をテスト"""
    expects = test_case_hello_hiho
    assert features(accent_phrase_hello) == expects[1:10]
    assert features(accent_phrase_hiho) == expects[11:19]


breath_group_hello = BreathGroupLabel.from_labels(labels_hello_hiho[1:10])
breath_group_hiho = BreathGroupLabel.from_labels(labels_hello_hiho[11:19])


def test_breath_group_phonemes() -> None:
    """BreathGroupLabel に含まれる音素系列をテスト"""
    outputs_hello = space_jointed_phonemes(breath_group_hello)
    outputs_hiho = space_jointed_phonemes(breath_group_hiho)
    assert outputs_hello == "k o N n i ch i w a"
    assert outputs_hiho == "h i h o d e s U"


def test_breath_group_features() -> None:
    """BreathGroupLabel に含まれる features をテスト"""
    expects = test_case_hello_hiho
    assert features(breath_group_hello) == expects[1:10]
    assert features(breath_group_hiho) == expects[11:19]


utterance_hello_hiho = UtteranceLabel.from_labels(labels_hello_hiho)


def test_utterance_phonemes() -> None:
    """UtteranceLabel に含まれる音素系列をテスト"""
    outputs_hello_hiho = space_jointed_phonemes(utterance_hello_hiho)
    expects_hello_hiho = "sil k o N n i ch i w a pau h i h o d e s U sil"
    assert outputs_hello_hiho == expects_hello_hiho


def test_utterance_features() -> None:
    """UtteranceLabel に含まれる features をテスト"""
    assert features(utterance_hello_hiho) == test_case_hello_hiho


def test_voice() -> None:
    assert mora_to_text("a") == "ア"
    assert mora_to_text("i") == "イ"
    assert mora_to_text("ka") == "カ"
    assert mora_to_text("N") == "ン"
    assert mora_to_text("cl") == "ッ"
    assert mora_to_text("gye") == "ギェ"
    assert mora_to_text("ye") == "イェ"
    assert mora_to_text("wo") == "ウォ"


def test_unvoice() -> None:
    assert mora_to_text("A") == "ア"
    assert mora_to_text("I") == "イ"
    assert mora_to_text("kA") == "カ"
    assert mora_to_text("gyE") == "ギェ"
    assert mora_to_text("yE") == "イェ"
    assert mora_to_text("wO") == "ウォ"


def test_invalid_mora() -> None:
    """変なモーラが来ても例外を投げない"""
    assert mora_to_text("x") == "x"
    assert mora_to_text("") == ""


def _gen_mora(text: str, consonant: str | None, vowel: str) -> Mora:
    return Mora(
        text=text,
        consonant=consonant,
        consonant_length=0 if consonant else None,
        vowel=vowel,
        vowel_length=0,
        pitch=0,
    )


def test_text_to_accent_phrases_normal() -> None:
    """`text_to_accent_phrases` は正常な日本語文をパースする"""
    # Inputs
    text = "こんにちは、ヒホです。"
    # Expects
    true_accent_phrases = [
        AccentPhrase(
            moras=[
                _gen_mora("コ", "k", "o"),
                _gen_mora("ン", None, "N"),
                _gen_mora("ニ", "n", "i"),
                _gen_mora("チ", "ch", "i"),
                _gen_mora("ワ", "w", "a"),
            ],
            accent=5,
            pause_mora=_gen_mora("、", None, "pau"),
        ),
        AccentPhrase(
            moras=[
                _gen_mora("ヒ", "h", "i"),
                _gen_mora("ホ", "h", "o"),
                _gen_mora("デ", "d", "e"),
                _gen_mora("ス", "s", "U"),
            ],
            accent=1,
            pause_mora=None,
        ),
    ]
    # Outputs
    accent_phrases = text_to_accent_phrases(text)
    # Tests
    assert accent_phrases == true_accent_phrases


def stub_unknown_features_koxx(_: str) -> list[str]:
    """`sil-k-o-xx-sil` に相当する features を常に返す `text_to_features()` のStub"""
    return [
        ".^.-sil+.=./A:.+xx+./B:.-._./C:._.+./D:.+._./E:._.!._.-./F:xx_xx#xx_.@xx_.|._./G:._.%._._./H:._./I:.-.@xx+.&.-.|.+./J:._./K:.+.-.",
        ".^.-k+.=./A:.+1+./B:.-._./C:._.+./D:.+._./E:._.!._.-./F:2_1#0_.@1_.|._./G:._.%._._./H:._./I:.-.@1+.&.-.|.+./J:._./K:.+.-.",
        ".^.-o+.=./A:.+1+./B:.-._./C:._.+./D:.+._./E:._.!._.-./F:2_1#0_.@1_.|._./G:._.%._._./H:._./I:.-.@1+.&.-.|.+./J:._./K:.+.-.",
        ".^.-xx+.=./A:.+2+./B:.-._./C:._.+./D:.+._./E:._.!._.-./F:2_1#0_.@1_.|._./G:._.%._._./H:._./I:.-.@1+.&.-.|.+./J:._./K:.+.-.",
        ".^.-sil+.=./A:.+xx+./B:.-._./C:._.+./D:.+._./E:._.!._.-./F:xx_xx#xx_.@xx_.|._./G:._.%._._./H:._./I:.-.@xx+.&.-.|.+./J:._./K:.+.-.",
    ]


def test_text_to_accent_phrases_unknown() -> None:
    """`text_to_accent_phrases` は unknown 音素を含む features をパースする"""
    # Expects
    true_accent_phrases = [
        AccentPhrase(
            moras=[
                _gen_mora("コ", "k", "o"),
                _gen_mora("xx", None, "xx"),
            ],
            accent=1,
            pause_mora=None,
        ),
    ]
    # Outputs
    accent_phrases = text_to_accent_phrases(
        "dummy", text_to_features=stub_unknown_features_koxx
    )
    # Tests
    assert accent_phrases == true_accent_phrases