# Spaces: Running Running  (hosting-page header residue; not part of the module)
from voicevox_engine.tts_pipeline.model import AccentPhrase, Mora | |
from voicevox_engine.tts_pipeline.text_analyzer import ( | |
AccentPhraseLabel, | |
BreathGroupLabel, | |
Label, | |
MoraLabel, | |
UtteranceLabel, | |
mora_to_text, | |
text_to_accent_phrases, | |
) | |
def contexts_to_feature(contexts: dict[str, str]) -> str:
    """Render a label's ``contexts`` mapping as a full-context feature string.

    Args:
        contexts: Mapping that provides every key used by the template below
            (``p1``-``p5``, ``a1``-``a3``, ``b1``-``b3``, ``c1``-``c3``,
            ``d1``-``d3``, ``e1``-``e5``, ``f1``-``f8``, ``g1``-``g5``,
            ``h1``-``h2``, ``i1``-``i8``, ``j1``-``j2``, ``k1``-``k3``).
            Extra keys are ignored.

    Returns:
        The feature string, with the segments separated by ``/``.

    Raises:
        KeyError: If a key required by the template is missing.
    """
    # One entry per "/"-separated segment of the feature string.
    segments = (
        "{p1}^{p2}-{p3}+{p4}={p5}",
        "A:{a1}+{a2}+{a3}",
        "B:{b1}-{b2}_{b3}",
        "C:{c1}_{c2}+{c3}",
        "D:{d1}+{d2}_{d3}",
        "E:{e1}_{e2}!{e3}_{e4}-{e5}",
        "F:{f1}_{f2}#{f3}_{f4}@{f5}_{f6}|{f7}_{f8}",
        "G:{g1}_{g2}%{g3}_{g4}_{g5}",
        "H:{h1}_{h2}",
        "I:{i1}-{i2}@{i3}+{i4}&{i5}-{i6}|{i7}+{i8}",
        "J:{j1}_{j2}",
        "K:{k1}+{k2}-{k3}",
    )
    template = "/".join(segments)
    return template.format(**contexts)
# Any of the OpenJTalk label container classes; used as the parameter type of
# the helper functions in this module.
OjtContainer = (
    MoraLabel
    | AccentPhraseLabel
    | BreathGroupLabel
    | UtteranceLabel
)
def features(ojt_container: OjtContainer) -> list[str]:
    """Return one feature string per label in ``ojt_container.labels``.

    ``labels`` is expected to cover every label the container holds, whether
    directly or through nested containers.
    """
    every_contexts = (label.contexts for label in ojt_container.labels)
    return list(map(contexts_to_feature, every_contexts))
# Output of pyopenjtalk.extract_fullcontext("こんにちは、ヒホです。").
# The features are hard-coded so that the tests depend on as few other
# libraries as possible and so that what is being tested stays transparent.
# There is one entry per phoneme, 20 in total.
test_case_hello_hiho = [
    # sil (silence)
    (
        "xx^xx-sil+k=o/A:xx+xx+xx/B:xx-xx_xx/C:xx_xx+xx/D:09+xx_xx/E:xx_xx!xx_xx-xx"
        "/F:xx_xx#xx_xx@xx_xx|xx_xx/G:5_5%0_xx_xx/H:xx_xx/I:xx-xx"
        "@xx+xx&xx-xx|xx+xx/J:1_5/K:2+2-9"
    ),
    # k
    (
        "xx^sil-k+o=N/A:-4+1+5/B:xx-xx_xx/C:09_xx+xx/D:09+xx_xx/E:xx_xx!xx_xx-xx"
        "/F:5_5#0_xx@1_1|1_5/G:4_1%0_xx_0/H:xx_xx/I:1-5"
        "@1+2&1-2|1+9/J:1_4/K:2+2-9"
    ),
    # o
    (
        "sil^k-o+N=n/A:-4+1+5/B:xx-xx_xx/C:09_xx+xx/D:09+xx_xx/E:xx_xx!xx_xx-xx"
        "/F:5_5#0_xx@1_1|1_5/G:4_1%0_xx_0/H:xx_xx/I:1-5"
        "@1+2&1-2|1+9/J:1_4/K:2+2-9"
    ),
    # N (moraic nasal)
    (
        "k^o-N+n=i/A:-3+2+4/B:xx-xx_xx/C:09_xx+xx/D:09+xx_xx/E:xx_xx!xx_xx-xx"
        "/F:5_5#0_xx@1_1|1_5/G:4_1%0_xx_0/H:xx_xx/I:1-5"
        "@1+2&1-2|1+9/J:1_4/K:2+2-9"
    ),
    # n
    (
        "o^N-n+i=ch/A:-2+3+3/B:xx-xx_xx/C:09_xx+xx/D:09+xx_xx/E:xx_xx!xx_xx-xx"
        "/F:5_5#0_xx@1_1|1_5/G:4_1%0_xx_0/H:xx_xx/I:1-5"
        "@1+2&1-2|1+9/J:1_4/K:2+2-9"
    ),
    # i
    (
        "N^n-i+ch=i/A:-2+3+3/B:xx-xx_xx/C:09_xx+xx/D:09+xx_xx/E:xx_xx!xx_xx-xx"
        "/F:5_5#0_xx@1_1|1_5/G:4_1%0_xx_0/H:xx_xx/I:1-5"
        "@1+2&1-2|1+9/J:1_4/K:2+2-9"
    ),
    # ch
    (
        "n^i-ch+i=w/A:-1+4+2/B:xx-xx_xx/C:09_xx+xx/D:09+xx_xx/E:xx_xx!xx_xx-xx"
        "/F:5_5#0_xx@1_1|1_5/G:4_1%0_xx_0/H:xx_xx/I:1-5"
        "@1+2&1-2|1+9/J:1_4/K:2+2-9"
    ),
    # i
    (
        "i^ch-i+w=a/A:-1+4+2/B:xx-xx_xx/C:09_xx+xx/D:09+xx_xx/E:xx_xx!xx_xx-xx"
        "/F:5_5#0_xx@1_1|1_5/G:4_1%0_xx_0/H:xx_xx/I:1-5"
        "@1+2&1-2|1+9/J:1_4/K:2+2-9"
    ),
    # w
    (
        "ch^i-w+a=pau/A:0+5+1/B:xx-xx_xx/C:09_xx+xx/D:09+xx_xx/E:xx_xx!xx_xx-xx"
        "/F:5_5#0_xx@1_1|1_5/G:4_1%0_xx_0/H:xx_xx/I:1-5"
        "@1+2&1-2|1+9/J:1_4/K:2+2-9"
    ),
    # a
    (
        "i^w-a+pau=h/A:0+5+1/B:xx-xx_xx/C:09_xx+xx/D:09+xx_xx/E:xx_xx!xx_xx-xx"
        "/F:5_5#0_xx@1_1|1_5/G:4_1%0_xx_0/H:xx_xx/I:1-5"
        "@1+2&1-2|1+9/J:1_4/K:2+2-9"
    ),
    # pau (pause at the comma)
    (
        "w^a-pau+h=i/A:xx+xx+xx/B:09-xx_xx/C:xx_xx+xx/D:09+xx_xx/E:5_5!0_xx-xx"
        "/F:xx_xx#xx_xx@xx_xx|xx_xx/G:4_1%0_xx_xx/H:1_5/I:xx-xx"
        "@xx+xx&xx-xx|xx+xx/J:1_4/K:2+2-9"
    ),
    # h
    (
        "a^pau-h+i=h/A:0+1+4/B:09-xx_xx/C:09_xx+xx/D:22+xx_xx/E:5_5!0_xx-0"
        "/F:4_1#0_xx@1_1|1_4/G:xx_xx%xx_xx_xx/H:1_5/I:1-4"
        "@2+1&2-1|6+4/J:xx_xx/K:2+2-9"
    ),
    # i
    (
        "pau^h-i+h=o/A:0+1+4/B:09-xx_xx/C:09_xx+xx/D:22+xx_xx/E:5_5!0_xx-0"
        "/F:4_1#0_xx@1_1|1_4/G:xx_xx%xx_xx_xx/H:1_5/I:1-4"
        "@2+1&2-1|6+4/J:xx_xx/K:2+2-9"
    ),
    # h
    (
        "h^i-h+o=d/A:1+2+3/B:09-xx_xx/C:22_xx+xx/D:10+7_2/E:5_5!0_xx-0"
        "/F:4_1#0_xx@1_1|1_4/G:xx_xx%xx_xx_xx/H:1_5/I:1-4"
        "@2+1&2-1|6+4/J:xx_xx/K:2+2-9"
    ),
    # o
    (
        "i^h-o+d=e/A:1+2+3/B:09-xx_xx/C:22_xx+xx/D:10+7_2/E:5_5!0_xx-0"
        "/F:4_1#0_xx@1_1|1_4/G:xx_xx%xx_xx_xx/H:1_5/I:1-4"
        "@2+1&2-1|6+4/J:xx_xx/K:2+2-9"
    ),
    # d
    (
        "h^o-d+e=s/A:2+3+2/B:22-xx_xx/C:10_7+2/D:xx+xx_xx/E:5_5!0_xx-0"
        "/F:4_1#0_xx@1_1|1_4/G:xx_xx%xx_xx_xx/H:1_5/I:1-4"
        "@2+1&2-1|6+4/J:xx_xx/K:2+2-9"
    ),
    # e
    (
        "o^d-e+s=U/A:2+3+2/B:22-xx_xx/C:10_7+2/D:xx+xx_xx/E:5_5!0_xx-0"
        "/F:4_1#0_xx@1_1|1_4/G:xx_xx%xx_xx_xx/H:1_5/I:1-4"
        "@2+1&2-1|6+4/J:xx_xx/K:2+2-9"
    ),
    # s
    (
        "d^e-s+U=sil/A:3+4+1/B:22-xx_xx/C:10_7+2/D:xx+xx_xx/E:5_5!0_xx-0"
        "/F:4_1#0_xx@1_1|1_4/G:xx_xx%xx_xx_xx/H:1_5/I:1-4"
        "@2+1&2-1|6+4/J:xx_xx/K:2+2-9"
    ),
    # U (devoiced vowel)
    (
        "e^s-U+sil=xx/A:3+4+1/B:22-xx_xx/C:10_7+2/D:xx+xx_xx/E:5_5!0_xx-0"
        "/F:4_1#0_xx@1_1|1_4/G:xx_xx%xx_xx_xx/H:1_5/I:1-4"
        "@2+1&2-1|6+4/J:xx_xx/K:2+2-9"
    ),
    # sil (silence)
    (
        "s^U-sil+xx=xx/A:xx+xx+xx/B:10-7_2/C:xx_xx+xx/D:xx+xx_xx/E:4_1!0_xx-xx"
        "/F:xx_xx#xx_xx@xx_xx|xx_xx/G:xx_xx%xx_xx_xx/H:1_4/I:xx-xx"
        "@xx+xx&xx-xx|xx+xx/J:xx_xx/K:2+2-9"
    ),
]
# One Label per hard-coded feature string, in the same order.
labels_hello_hiho = list(map(Label.from_feature, test_case_hello_hiho))
def jointed_phonemes(ojt_container: OjtContainer) -> str:
    """Concatenate, without a separator, the phonemes of all labels in the container.

    The phonemes are read from ``ojt_container.labels``, which is expected to
    cover labels held directly or through nested containers.
    """
    phonemes = (label.phoneme for label in ojt_container.labels)
    return "".join(phonemes)
def space_jointed_phonemes(ojt_container: OjtContainer) -> str:
    """Join the phonemes of all labels in the container with single spaces.

    The phonemes are read from ``ojt_container.labels``, which is expected to
    cover labels held directly or through nested containers.
    """
    phonemes = (label.phoneme for label in ojt_container.labels)
    return " ".join(phonemes)
def test_label_phoneme() -> None:
    """Check the phoneme carried by each Label."""
    expected = "sil k o N n i ch i w a pau h i h o d e s U sil"
    actual = " ".join(label.phoneme for label in labels_hello_hiho)
    assert actual == expected
def test_label_is_pause() -> None:
    """Check the pause detection of each Label."""
    # Only the leading/trailing `sil` and the `pau` are expected to be pauses.
    expected = (
        [True]  # sil
        + [False] * 9  # k o N n i ch i w a
        + [True]  # pau
        + [False] * 8  # h i h o d e s U
        + [True]  # sil
    )
    actual = [label.is_pause() for label in labels_hello_hiho]
    assert actual == expected
def test_label_feature() -> None:
    """Check that each Label's contexts render back to its source feature."""
    rebuilt = [contexts_to_feature(label.contexts) for label in labels_hello_hiho]
    assert rebuilt == test_case_hello_hiho
# One MoraLabel per mora of the two phrases. Each comment gives the mora's
# phonemes and the contexts["a2"] value found in its source features.

# "ko": contexts["a2"] == "1"
mora_hello_1 = MoraLabel(
    consonant=labels_hello_hiho[1],
    vowel=labels_hello_hiho[2],
)
# "N": contexts["a2"] == "2" (no consonant)
mora_hello_2 = MoraLabel(
    consonant=None,
    vowel=labels_hello_hiho[3],
)
# "ni": contexts["a2"] == "3"
mora_hello_3 = MoraLabel(
    consonant=labels_hello_hiho[4],
    vowel=labels_hello_hiho[5],
)
# "chi": contexts["a2"] == "4"
mora_hello_4 = MoraLabel(
    consonant=labels_hello_hiho[6],
    vowel=labels_hello_hiho[7],
)
# "wa": contexts["a2"] == "5"
mora_hello_5 = MoraLabel(
    consonant=labels_hello_hiho[8],
    vowel=labels_hello_hiho[9],
)
# "hi": contexts["a2"] == "1"
mora_hiho_1 = MoraLabel(
    consonant=labels_hello_hiho[11],
    vowel=labels_hello_hiho[12],
)
# "ho": contexts["a2"] == "2"
mora_hiho_2 = MoraLabel(
    consonant=labels_hello_hiho[13],
    vowel=labels_hello_hiho[14],
)
# "de": contexts["a2"] == "3"
mora_hiho_3 = MoraLabel(
    consonant=labels_hello_hiho[15],
    vowel=labels_hello_hiho[16],
)
# "sU": contexts["a2"] == "4"
mora_hiho_4 = MoraLabel(
    consonant=labels_hello_hiho[17],
    vowel=labels_hello_hiho[18],
)
def test_mora_label_phonemes() -> None:
    """Check the phoneme sequence held by each MoraLabel."""
    cases = [
        (mora_hello_1, "ko"),
        (mora_hello_2, "N"),
        (mora_hello_3, "ni"),
        (mora_hello_4, "chi"),
        (mora_hello_5, "wa"),
        (mora_hiho_1, "hi"),
        (mora_hiho_2, "ho"),
        (mora_hiho_3, "de"),
        (mora_hiho_4, "sU"),
    ]
    for mora, expected in cases:
        assert jointed_phonemes(mora) == expected
def test_mora_label_features() -> None:
    """Check the features held by each MoraLabel."""
    expects = test_case_hello_hiho
    # Each mora is paired with the slice of source features it was built from.
    cases = [
        (mora_hello_1, expects[1:3]),
        (mora_hello_2, expects[3:4]),
        (mora_hello_3, expects[4:6]),
        (mora_hello_4, expects[6:8]),
        (mora_hello_5, expects[8:10]),
        (mora_hiho_1, expects[11:13]),
        (mora_hiho_2, expects[13:15]),
        (mora_hiho_3, expects[15:17]),
        (mora_hiho_4, expects[17:19]),
    ]
    for mora, expected in cases:
        assert features(mora) == expected
# TODO: look for a natural (non-contrived) example that raises ValueError.
# If no such example exists, this can be left as it is.

# Accent phrases built from the labels between the pauses:
# indices 1-9 ("k o N n i ch i w a") and 11-18 ("h i h o d e s U").
accent_phrase_hello, accent_phrase_hiho = (
    AccentPhraseLabel.from_labels(labels_hello_hiho[span])
    for span in (slice(1, 10), slice(11, 19))
)
def test_accent_phrase_accent() -> None:
    """Check the accent position held by each AccentPhraseLabel."""
    cases = ((accent_phrase_hello, 5), (accent_phrase_hiho, 1))
    for phrase, expected_accent in cases:
        assert phrase.accent == expected_accent
def test_accent_phrase_phonemes() -> None:
    """Check the phoneme sequence held by each AccentPhraseLabel."""
    hello_phonemes, hiho_phonemes = map(
        space_jointed_phonemes, (accent_phrase_hello, accent_phrase_hiho)
    )
    assert hello_phonemes == "k o N n i ch i w a"
    assert hiho_phonemes == "h i h o d e s U"
def test_accent_phrase_features() -> None:
    """Check the features held by each AccentPhraseLabel."""
    cases = (
        (accent_phrase_hello, test_case_hello_hiho[1:10]),
        (accent_phrase_hiho, test_case_hello_hiho[11:19]),
    )
    for phrase, expected in cases:
        assert features(phrase) == expected
# Breath groups built from the labels between the pauses:
# indices 1-9 ("k o N n i ch i w a") and 11-18 ("h i h o d e s U").
breath_group_hello, breath_group_hiho = (
    BreathGroupLabel.from_labels(labels_hello_hiho[span])
    for span in (slice(1, 10), slice(11, 19))
)
def test_breath_group_phonemes() -> None:
    """Check the phoneme sequence held by each BreathGroupLabel."""
    hello_phonemes, hiho_phonemes = map(
        space_jointed_phonemes, (breath_group_hello, breath_group_hiho)
    )
    assert hello_phonemes == "k o N n i ch i w a"
    assert hiho_phonemes == "h i h o d e s U"
def test_breath_group_features() -> None:
    """Check the features held by each BreathGroupLabel."""
    cases = (
        (breath_group_hello, test_case_hello_hiho[1:10]),
        (breath_group_hiho, test_case_hello_hiho[11:19]),
    )
    for group, expected in cases:
        assert features(group) == expected
# Utterance built from the complete label list, including the leading and
# trailing `sil` labels and the `pau` label.
utterance_hello_hiho = UtteranceLabel.from_labels(labels_hello_hiho)
def test_utterance_phonemes() -> None:
    """Check the phoneme sequence held by the UtteranceLabel."""
    expected = "sil k o N n i ch i w a pau h i h o d e s U sil"
    actual = space_jointed_phonemes(utterance_hello_hiho)
    assert actual == expected
def test_utterance_features() -> None:
    """Check the features held by the UtteranceLabel."""
    actual = features(utterance_hello_hiho)
    assert actual == test_case_hello_hiho
def test_voice() -> None:
    """Check `mora_to_text` on moras written with lower-case vowels, plus `N` and `cl`."""
    expected_text = {
        "a": "ア",
        "i": "イ",
        "ka": "カ",
        "N": "ン",
        "cl": "ッ",
        "gye": "ギェ",
        "ye": "イェ",
        "wo": "ウォ",
    }
    for mora, text in expected_text.items():
        assert mora_to_text(mora) == text
def test_unvoice() -> None:
    """Check `mora_to_text` on moras written with upper-case vowels.

    Going by the test name, these are presumably the devoiced variants; the
    assertions expect the same text as for the lower-case spellings.
    """
    expected_text = {
        "A": "ア",
        "I": "イ",
        "kA": "カ",
        "gyE": "ギェ",
        "yE": "イェ",
        "wO": "ウォ",
    }
    for mora, text in expected_text.items():
        assert mora_to_text(mora) == text
def test_invalid_mora() -> None:
    """Odd moras do not raise; the input is expected back unchanged."""
    for odd_mora in ("x", ""):
        assert mora_to_text(odd_mora) == odd_mora
def _gen_mora(text: str, consonant: str | None, vowel: str) -> Mora:
    """Build a Mora whose lengths and pitch are all zero.

    Args:
        text: Value for the mora's ``text`` field.
        consonant: Consonant phoneme, or ``None`` when the mora has none.
        vowel: Vowel phoneme.

    Returns:
        A Mora with ``vowel_length`` and ``pitch`` set to 0. ``consonant_length``
        is 0 when ``consonant`` is truthy and ``None`` otherwise (so an empty
        string is treated like ``None``).
    """
    if consonant:
        consonant_length = 0
    else:
        consonant_length = None
    return Mora(
        text=text,
        consonant=consonant,
        consonant_length=consonant_length,
        vowel=vowel,
        vowel_length=0,
        pitch=0,
    )
def test_text_to_accent_phrases_normal() -> None:
    """`text_to_accent_phrases` parses an ordinary Japanese sentence."""
    # Expected: two accent phrases, only the first followed by a pause mora.
    hello_phrase = AccentPhrase(
        moras=[
            _gen_mora(text, consonant, vowel)
            for text, consonant, vowel in (
                ("コ", "k", "o"),
                ("ン", None, "N"),
                ("ニ", "n", "i"),
                ("チ", "ch", "i"),
                ("ワ", "w", "a"),
            )
        ],
        accent=5,
        pause_mora=_gen_mora("、", None, "pau"),
    )
    hiho_phrase = AccentPhrase(
        moras=[
            _gen_mora(text, consonant, vowel)
            for text, consonant, vowel in (
                ("ヒ", "h", "i"),
                ("ホ", "h", "o"),
                ("デ", "d", "e"),
                ("ス", "s", "U"),
            )
        ],
        accent=1,
        pause_mora=None,
    )

    # Parse the sentence and compare with the expectation.
    parsed = text_to_accent_phrases("こんにちは、ヒホです。")
    assert parsed == [hello_phrase, hiho_phrase]
def stub_unknown_features_koxx(_: str) -> list[str]:
    """Stub for `text_to_features()` that ignores its input.

    Returns:
        Always the five features corresponding to `sil-k-o-xx-sil`.
    """
    # Every field is "." except the phoneme (p3) and a2, f1, f2, f3, f5, i3.
    template = (
        ".^.-{p3}+.=./A:.+{a2}+./B:.-._./C:._.+./D:.+._./E:._.!._.-."
        "/F:{f1}_{f2}#{f3}_.@{f5}_.|._./G:._.%._._./H:._."
        "/I:.-.@{i3}+.&.-.|.+./J:._./K:.+.-."
    )

    def in_phrase(phoneme: str, a2: str) -> str:
        # The three non-silence labels share the same f1/f2/f3/f5/i3 values.
        return template.format(
            p3=phoneme, a2=a2, f1="2", f2="1", f3="0", f5="1", i3="1"
        )

    silence = template.format(
        p3="sil", a2="xx", f1="xx", f2="xx", f3="xx", f5="xx", i3="xx"
    )
    return [
        silence,
        in_phrase("k", "1"),
        in_phrase("o", "1"),
        in_phrase("xx", "2"),
        silence,
    ]
def test_text_to_accent_phrases_unknown() -> None:
    """`text_to_accent_phrases` parses features that contain an unknown phoneme."""
    # Expected: a single phrase in which the unknown phoneme stays as "xx".
    expected = [
        AccentPhrase(
            moras=[_gen_mora("コ", "k", "o"), _gen_mora("xx", None, "xx")],
            accent=1,
            pause_mora=None,
        ),
    ]

    # The stub supplies the features, so the input text itself is irrelevant.
    parsed = text_to_accent_phrases(
        "dummy", text_to_features=stub_unknown_features_koxx
    )
    assert parsed == expected