Jirat Jaturanpinyo
Upload voicevox_engine
edc06cb verified
raw
history blame
14.7 kB
from voicevox_engine.tts_pipeline.model import AccentPhrase, Mora
from voicevox_engine.tts_pipeline.text_analyzer import (
AccentPhraseLabel,
BreathGroupLabel,
Label,
MoraLabel,
UtteranceLabel,
mora_to_text,
text_to_accent_phrases,
)
def contexts_to_feature(contexts: dict[str, str]) -> str:
    """Convert a label's contexts into a feature string.

    Args:
        contexts: Mapping from context key (``p1`` ... ``k3``) to its value.
            Every key used by the template must be present; keys the
            template does not use are ignored.

    Returns:
        The feature string with every placeholder filled in.

    Raises:
        KeyError: If a key used by the template is missing from ``contexts``.
    """
    # Adjacent literals are concatenated into a single template string.
    template = (
        "{p1}^{p2}-{p3}+{p4}={p5}"
        "/A:{a1}+{a2}+{a3}"
        "/B:{b1}-{b2}_{b3}"
        "/C:{c1}_{c2}+{c3}"
        "/D:{d1}+{d2}_{d3}"
        "/E:{e1}_{e2}!{e3}_{e4}-{e5}"
        "/F:{f1}_{f2}#{f3}_{f4}@{f5}_{f6}|{f7}_{f8}"
        "/G:{g1}_{g2}%{g3}_{g4}_{g5}"
        "/H:{h1}_{h2}"
        "/I:{i1}-{i2}@{i3}+{i4}&{i5}-{i6}|{i7}+{i8}"
        "/J:{j1}_{j2}"
        "/K:{k1}+{k2}-{k3}"
    )
    return template.format(**contexts)
# OpenJTalk container classes
OjtContainer = MoraLabel | AccentPhraseLabel | BreathGroupLabel | UtteranceLabel
def features(ojt_container: OjtContainer) -> list[str]:
    """Return every feature held, directly or indirectly, by a container instance."""
    # Each label's contexts are rendered back into a feature string, in label order.
    label_contexts = (label.contexts for label in ojt_container.labels)
    return list(map(contexts_to_feature, label_contexts))
# Result of pyopenjtalk.extract_fullcontext("こんにちは、ヒホです。").
# The test case is written out here so that the tests depend on other
# libraries as little as possible and so that the test contents stay transparent.
# Each entry is one feature string, split over adjacent literals for line length.
test_case_hello_hiho = [
    # sil (silence)
    "xx^xx-sil+k=o/A:xx+xx+xx/B:xx-xx_xx/C:xx_xx+xx/D:09+xx_xx/E:xx_xx!xx_xx-xx"
    "/F:xx_xx#xx_xx@xx_xx|xx_xx/G:5_5%0_xx_xx/H:xx_xx/I:xx-xx"
    "@xx+xx&xx-xx|xx+xx/J:1_5/K:2+2-9",
    # k
    "xx^sil-k+o=N/A:-4+1+5/B:xx-xx_xx/C:09_xx+xx/D:09+xx_xx/E:xx_xx!xx_xx-xx"
    "/F:5_5#0_xx@1_1|1_5/G:4_1%0_xx_0/H:xx_xx/I:1-5"
    "@1+2&1-2|1+9/J:1_4/K:2+2-9",
    # o
    "sil^k-o+N=n/A:-4+1+5/B:xx-xx_xx/C:09_xx+xx/D:09+xx_xx/E:xx_xx!xx_xx-xx"
    "/F:5_5#0_xx@1_1|1_5/G:4_1%0_xx_0/H:xx_xx/I:1-5"
    "@1+2&1-2|1+9/J:1_4/K:2+2-9",
    # N (moraic nasal)
    "k^o-N+n=i/A:-3+2+4/B:xx-xx_xx/C:09_xx+xx/D:09+xx_xx/E:xx_xx!xx_xx-xx"
    "/F:5_5#0_xx@1_1|1_5/G:4_1%0_xx_0/H:xx_xx/I:1-5"
    "@1+2&1-2|1+9/J:1_4/K:2+2-9",
    # n
    "o^N-n+i=ch/A:-2+3+3/B:xx-xx_xx/C:09_xx+xx/D:09+xx_xx/E:xx_xx!xx_xx-xx"
    "/F:5_5#0_xx@1_1|1_5/G:4_1%0_xx_0/H:xx_xx/I:1-5"
    "@1+2&1-2|1+9/J:1_4/K:2+2-9",
    # i
    "N^n-i+ch=i/A:-2+3+3/B:xx-xx_xx/C:09_xx+xx/D:09+xx_xx/E:xx_xx!xx_xx-xx"
    "/F:5_5#0_xx@1_1|1_5/G:4_1%0_xx_0/H:xx_xx/I:1-5"
    "@1+2&1-2|1+9/J:1_4/K:2+2-9",
    # ch
    "n^i-ch+i=w/A:-1+4+2/B:xx-xx_xx/C:09_xx+xx/D:09+xx_xx/E:xx_xx!xx_xx-xx"
    "/F:5_5#0_xx@1_1|1_5/G:4_1%0_xx_0/H:xx_xx/I:1-5"
    "@1+2&1-2|1+9/J:1_4/K:2+2-9",
    # i
    "i^ch-i+w=a/A:-1+4+2/B:xx-xx_xx/C:09_xx+xx/D:09+xx_xx/E:xx_xx!xx_xx-xx"
    "/F:5_5#0_xx@1_1|1_5/G:4_1%0_xx_0/H:xx_xx/I:1-5"
    "@1+2&1-2|1+9/J:1_4/K:2+2-9",
    # w
    "ch^i-w+a=pau/A:0+5+1/B:xx-xx_xx/C:09_xx+xx/D:09+xx_xx/E:xx_xx!xx_xx-xx"
    "/F:5_5#0_xx@1_1|1_5/G:4_1%0_xx_0/H:xx_xx/I:1-5"
    "@1+2&1-2|1+9/J:1_4/K:2+2-9",
    # a
    "i^w-a+pau=h/A:0+5+1/B:xx-xx_xx/C:09_xx+xx/D:09+xx_xx/E:xx_xx!xx_xx-xx"
    "/F:5_5#0_xx@1_1|1_5/G:4_1%0_xx_0/H:xx_xx/I:1-5"
    "@1+2&1-2|1+9/J:1_4/K:2+2-9",
    # pau (comma)
    "w^a-pau+h=i/A:xx+xx+xx/B:09-xx_xx/C:xx_xx+xx/D:09+xx_xx/E:5_5!0_xx-xx"
    "/F:xx_xx#xx_xx@xx_xx|xx_xx/G:4_1%0_xx_xx/H:1_5/I:xx-xx"
    "@xx+xx&xx-xx|xx+xx/J:1_4/K:2+2-9",
    # h
    "a^pau-h+i=h/A:0+1+4/B:09-xx_xx/C:09_xx+xx/D:22+xx_xx/E:5_5!0_xx-0"
    "/F:4_1#0_xx@1_1|1_4/G:xx_xx%xx_xx_xx/H:1_5/I:1-4"
    "@2+1&2-1|6+4/J:xx_xx/K:2+2-9",
    # i
    "pau^h-i+h=o/A:0+1+4/B:09-xx_xx/C:09_xx+xx/D:22+xx_xx/E:5_5!0_xx-0"
    "/F:4_1#0_xx@1_1|1_4/G:xx_xx%xx_xx_xx/H:1_5/I:1-4"
    "@2+1&2-1|6+4/J:xx_xx/K:2+2-9",
    # h
    "h^i-h+o=d/A:1+2+3/B:09-xx_xx/C:22_xx+xx/D:10+7_2/E:5_5!0_xx-0"
    "/F:4_1#0_xx@1_1|1_4/G:xx_xx%xx_xx_xx/H:1_5/I:1-4"
    "@2+1&2-1|6+4/J:xx_xx/K:2+2-9",
    # o
    "i^h-o+d=e/A:1+2+3/B:09-xx_xx/C:22_xx+xx/D:10+7_2/E:5_5!0_xx-0"
    "/F:4_1#0_xx@1_1|1_4/G:xx_xx%xx_xx_xx/H:1_5/I:1-4"
    "@2+1&2-1|6+4/J:xx_xx/K:2+2-9",
    # d
    "h^o-d+e=s/A:2+3+2/B:22-xx_xx/C:10_7+2/D:xx+xx_xx/E:5_5!0_xx-0"
    "/F:4_1#0_xx@1_1|1_4/G:xx_xx%xx_xx_xx/H:1_5/I:1-4"
    "@2+1&2-1|6+4/J:xx_xx/K:2+2-9",
    # e
    "o^d-e+s=U/A:2+3+2/B:22-xx_xx/C:10_7+2/D:xx+xx_xx/E:5_5!0_xx-0"
    "/F:4_1#0_xx@1_1|1_4/G:xx_xx%xx_xx_xx/H:1_5/I:1-4"
    "@2+1&2-1|6+4/J:xx_xx/K:2+2-9",
    # s
    "d^e-s+U=sil/A:3+4+1/B:22-xx_xx/C:10_7+2/D:xx+xx_xx/E:5_5!0_xx-0"
    "/F:4_1#0_xx@1_1|1_4/G:xx_xx%xx_xx_xx/H:1_5/I:1-4"
    "@2+1&2-1|6+4/J:xx_xx/K:2+2-9",
    # U (devoiced vowel)
    "e^s-U+sil=xx/A:3+4+1/B:22-xx_xx/C:10_7+2/D:xx+xx_xx/E:5_5!0_xx-0"
    "/F:4_1#0_xx@1_1|1_4/G:xx_xx%xx_xx_xx/H:1_5/I:1-4"
    "@2+1&2-1|6+4/J:xx_xx/K:2+2-9",
    # sil (silence)
    "s^U-sil+xx=xx/A:xx+xx+xx/B:10-7_2/C:xx_xx+xx/D:xx+xx_xx/E:4_1!0_xx-xx"
    "/F:xx_xx#xx_xx@xx_xx|xx_xx/G:xx_xx%xx_xx_xx/H:1_4/I:xx-xx"
    "@xx+xx&xx-xx|xx+xx/J:xx_xx/K:2+2-9",
]
# One Label per feature string of the test case, in the same order
labels_hello_hiho = list(map(Label.from_feature, test_case_hello_hiho))
def jointed_phonemes(ojt_container: OjtContainer) -> str:
    """Return the phonemes of every label held, directly or indirectly, by the container, joined with no separator."""
    phonemes = (label.phoneme for label in ojt_container.labels)
    return "".join(phonemes)
def space_jointed_phonemes(ojt_container: OjtContainer) -> str:
    """Return the phonemes of every label held, directly or indirectly, by the container, joined with a single space."""
    phonemes = (label.phoneme for label in ojt_container.labels)
    return " ".join(phonemes)
def test_label_phoneme() -> None:
    """Test the phoneme held by each Label."""
    expected_phonemes = "sil k o N n i ch i w a pau h i h o d e s U sil"
    actual_phonemes = " ".join(label.phoneme for label in labels_hello_hiho)
    assert actual_phonemes == expected_phonemes
def test_label_is_pause() -> None:
    """Test the pause judgement of each Label."""
    # One flag per label; only the two `sil` labels and the `pau` label are pauses.
    expected_flags = [
        True,  # sil
        False,  # k
        False,  # o
        False,  # N
        False,  # n
        False,  # i
        False,  # ch
        False,  # i
        False,  # w
        False,  # a
        True,  # pau
        False,  # h
        False,  # i
        False,  # h
        False,  # o
        False,  # d
        False,  # e
        False,  # s
        False,  # U
        True,  # sil
    ]
    actual_flags = [label.is_pause() for label in labels_hello_hiho]
    assert actual_flags == expected_flags
def test_label_feature() -> None:
    """Test the features held by each Label."""
    # Rendering each label's contexts must reproduce the original feature strings.
    rebuilt_features = [
        contexts_to_feature(label.contexts) for label in labels_hello_hiho
    ]
    assert rebuilt_features == test_case_hello_hiho
# MoraLabel fixtures built from pairs of labels in `labels_hello_hiho`.
# The comment above each one gives the labels' `a2` context value and the mora's phonemes.
# contexts["a2"] == "1" ko
mora_hello_1 = MoraLabel(consonant=labels_hello_hiho[1], vowel=labels_hello_hiho[2])
# contexts["a2"] == "2" N (no consonant label)
mora_hello_2 = MoraLabel(consonant=None, vowel=labels_hello_hiho[3])
# contexts["a2"] == "3" ni
mora_hello_3 = MoraLabel(consonant=labels_hello_hiho[4], vowel=labels_hello_hiho[5])
# contexts["a2"] == "4" chi
mora_hello_4 = MoraLabel(consonant=labels_hello_hiho[6], vowel=labels_hello_hiho[7])
# contexts["a2"] == "5" wa
mora_hello_5 = MoraLabel(consonant=labels_hello_hiho[8], vowel=labels_hello_hiho[9])
# contexts["a2"] == "1" hi
mora_hiho_1 = MoraLabel(consonant=labels_hello_hiho[11], vowel=labels_hello_hiho[12])
# contexts["a2"] == "2" ho
mora_hiho_2 = MoraLabel(consonant=labels_hello_hiho[13], vowel=labels_hello_hiho[14])
# contexts["a2"] == "3" de
mora_hiho_3 = MoraLabel(consonant=labels_hello_hiho[15], vowel=labels_hello_hiho[16])
# contexts["a2"] == "4" sU
mora_hiho_4 = MoraLabel(consonant=labels_hello_hiho[17], vowel=labels_hello_hiho[18])
def test_mora_label_phonemes() -> None:
    """Test the phoneme sequence held by each MoraLabel."""
    cases = [
        (mora_hello_1, "ko"),
        (mora_hello_2, "N"),
        (mora_hello_3, "ni"),
        (mora_hello_4, "chi"),
        (mora_hello_5, "wa"),
        (mora_hiho_1, "hi"),
        (mora_hiho_2, "ho"),
        (mora_hiho_3, "de"),
        (mora_hiho_4, "sU"),
    ]
    for mora, expected_phonemes in cases:
        assert jointed_phonemes(mora) == expected_phonemes
def test_mora_label_features() -> None:
    """Test the features held by each MoraLabel."""
    expects = test_case_hello_hiho
    # Each mora is paired with the slice of the test case it was built from.
    cases = [
        (mora_hello_1, expects[1:3]),
        (mora_hello_2, expects[3:4]),
        (mora_hello_3, expects[4:6]),
        (mora_hello_4, expects[6:8]),
        (mora_hello_5, expects[8:10]),
        (mora_hiho_1, expects[11:13]),
        (mora_hiho_2, expects[13:15]),
        (mora_hiho_3, expects[15:17]),
        (mora_hiho_4, expects[17:19]),
    ]
    for mora, expected_features in cases:
        assert features(mora) == expected_features
# TODO: look for a natural, non-contrived example that raises ValueError
# If no such example exists, this can be left as is
accent_phrase_hello = AccentPhraseLabel.from_labels(labels_hello_hiho[1:10])
accent_phrase_hiho = AccentPhraseLabel.from_labels(labels_hello_hiho[11:19])
def test_accent_phrase_accent() -> None:
    """Test the accent position held by each AccentPhraseLabel."""
    cases = (
        (accent_phrase_hello, 5),
        (accent_phrase_hiho, 1),
    )
    for accent_phrase, expected_accent in cases:
        assert accent_phrase.accent == expected_accent
def test_accent_phrase_phonemes() -> None:
    """Test the phoneme sequence held by each AccentPhraseLabel."""
    accent_phrases = (accent_phrase_hello, accent_phrase_hiho)
    outputs = [space_jointed_phonemes(phrase) for phrase in accent_phrases]
    assert outputs == ["k o N n i ch i w a", "h i h o d e s U"]
def test_accent_phrase_features() -> None:
    """Test the features held by each AccentPhraseLabel."""
    expects = test_case_hello_hiho
    cases = (
        (accent_phrase_hello, expects[1:10]),
        (accent_phrase_hiho, expects[11:19]),
    )
    for accent_phrase, expected_features in cases:
        assert features(accent_phrase) == expected_features
# BreathGroupLabel fixtures built from labels 1-9 and labels 11-18 of the test case
breath_group_hello = BreathGroupLabel.from_labels(labels_hello_hiho[1:10])
breath_group_hiho = BreathGroupLabel.from_labels(labels_hello_hiho[11:19])
def test_breath_group_phonemes() -> None:
    """Test the phoneme sequence held by each BreathGroupLabel."""
    breath_groups = (breath_group_hello, breath_group_hiho)
    outputs = [space_jointed_phonemes(group) for group in breath_groups]
    assert outputs == ["k o N n i ch i w a", "h i h o d e s U"]
def test_breath_group_features() -> None:
    """Test the features held by each BreathGroupLabel."""
    expects = test_case_hello_hiho
    cases = (
        (breath_group_hello, expects[1:10]),
        (breath_group_hiho, expects[11:19]),
    )
    for breath_group, expected_features in cases:
        assert features(breath_group) == expected_features
# UtteranceLabel fixture built from every label of the test case
utterance_hello_hiho = UtteranceLabel.from_labels(labels_hello_hiho)
def test_utterance_phonemes() -> None:
    """Test the phoneme sequence held by UtteranceLabel."""
    assert (
        space_jointed_phonemes(utterance_hello_hiho)
        == "sil k o N n i ch i w a pau h i h o d e s U sil"
    )
def test_utterance_features() -> None:
    """Test the features held by UtteranceLabel."""
    actual_features = features(utterance_hello_hiho)
    assert actual_features == test_case_hello_hiho
def test_voice() -> None:
    """Test `mora_to_text` on moras written with lowercase vowels, plus `N` and `cl`."""
    expected_by_mora = {
        "a": "ア",
        "i": "イ",
        "ka": "カ",
        "N": "ン",
        "cl": "ッ",
        "gye": "ギェ",
        "ye": "イェ",
        "wo": "ウォ",
    }
    for mora, expected_text in expected_by_mora.items():
        assert mora_to_text(mora) == expected_text
def test_unvoice() -> None:
    """Test `mora_to_text` on moras written with uppercase vowels."""
    expected_by_mora = {
        "A": "ア",
        "I": "イ",
        "kA": "カ",
        "gyE": "ギェ",
        "yE": "イェ",
        "wO": "ウォ",
    }
    for mora, expected_text in expected_by_mora.items():
        assert mora_to_text(mora) == expected_text
def test_invalid_mora() -> None:
    """No exception is raised even when an odd mora is given."""
    # The inputs are expected to come back unchanged.
    for odd_mora in ("x", ""):
        assert mora_to_text(odd_mora) == odd_mora
def _gen_mora(text: str, consonant: str | None, vowel: str) -> Mora:
    """Build a Mora with zeroed lengths and pitch.

    `consonant_length` is 0 when `consonant` is truthy and None otherwise.
    """
    if consonant:
        consonant_length = 0
    else:
        consonant_length = None
    return Mora(
        text=text,
        consonant=consonant,
        consonant_length=consonant_length,
        vowel=vowel,
        vowel_length=0,
        pitch=0,
    )
def test_text_to_accent_phrases_normal() -> None:
    """`text_to_accent_phrases` parses a normal Japanese sentence."""
    # Inputs
    text = "こんにちは、ヒホです。"

    # Expects
    # Each spec is (text, consonant, vowel) for one mora.
    hello_specs: list[tuple[str, str | None, str]] = [
        ("コ", "k", "o"),
        ("ン", None, "N"),
        ("ニ", "n", "i"),
        ("チ", "ch", "i"),
        ("ワ", "w", "a"),
    ]
    hiho_specs: list[tuple[str, str | None, str]] = [
        ("ヒ", "h", "i"),
        ("ホ", "h", "o"),
        ("デ", "d", "e"),
        ("ス", "s", "U"),
    ]
    true_accent_phrases = [
        AccentPhrase(
            moras=[_gen_mora(*spec) for spec in hello_specs],
            accent=5,
            pause_mora=_gen_mora("、", None, "pau"),
        ),
        AccentPhrase(
            moras=[_gen_mora(*spec) for spec in hiho_specs],
            accent=1,
            pause_mora=None,
        ),
    ]

    # Outputs
    accent_phrases = text_to_accent_phrases(text)

    # Tests
    assert accent_phrases == true_accent_phrases
def stub_unknown_features_koxx(_: str) -> list[str]:
    """Stub for `text_to_features()` that always returns the features corresponding to `sil-k-o-xx-sil`."""
    # The leading and trailing silences use the same feature string.
    silence = ".^.-sil+.=./A:.+xx+./B:.-._./C:._.+./D:.+._./E:._.!._.-./F:xx_xx#xx_.@xx_.|._./G:._.%._._./H:._./I:.-.@xx+.&.-.|.+./J:._./K:.+.-."
    phoneme_k = ".^.-k+.=./A:.+1+./B:.-._./C:._.+./D:.+._./E:._.!._.-./F:2_1#0_.@1_.|._./G:._.%._._./H:._./I:.-.@1+.&.-.|.+./J:._./K:.+.-."
    phoneme_o = ".^.-o+.=./A:.+1+./B:.-._./C:._.+./D:.+._./E:._.!._.-./F:2_1#0_.@1_.|._./G:._.%._._./H:._./I:.-.@1+.&.-.|.+./J:._./K:.+.-."
    phoneme_unknown = ".^.-xx+.=./A:.+2+./B:.-._./C:._.+./D:.+._./E:._.!._.-./F:2_1#0_.@1_.|._./G:._.%._._./H:._./I:.-.@1+.&.-.|.+./J:._./K:.+.-."
    return [silence, phoneme_k, phoneme_o, phoneme_unknown, silence]
def test_text_to_accent_phrases_unknown() -> None:
    """`text_to_accent_phrases` parses features that contain an unknown phoneme."""
    # Expects
    # The `xx` phoneme is expected as a consonant-less mora whose text is also `xx`.
    expected_moras = [
        _gen_mora("コ", "k", "o"),
        _gen_mora("xx", None, "xx"),
    ]
    true_accent_phrases = [
        AccentPhrase(moras=expected_moras, accent=1, pause_mora=None),
    ]

    # Outputs
    accent_phrases = text_to_accent_phrases(
        "dummy",
        text_to_features=stub_unknown_features_koxx,
    )

    # Tests
    assert accent_phrases == true_accent_phrases