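# NOTE(review): the three lines below look like web-page / commit-header
# residue rather than module code; kept as comments so the file parses.
# jhansss's picture
# bug fixed
# 7a23964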
import json
import re
import warnings
from pathlib import Path
from kanjiconv import KanjiConv
from pypinyin import lazy_pinyin
from .resources.pinyin_dict import PINYIN_DICT
# Module-level KanjiConv instance, created once at import time; to_kana()
# calls its to_hiragana() method on the input text.
kanji_to_kana = KanjiConv()
# Small kana -> full-size counterpart.
# Membership in this map is how the kana tokenizer recognises a small kana
# that must stay attached to the kana before it (e.g. "きゃ", "ふぁ"); the
# mapped value is the full-size kana carrying the same vowel.
# The literals were previously mis-encoded (UTF-8 bytes rendered through a
# different code page), so no real hiragana could ever match a key.
yoon_map = {
    "ぁ": "あ",
    "ぃ": "い",
    "ぅ": "う",
    "ぇ": "え",
    "ぉ": "お",
    "ゃ": "や",
    "ゅ": "ゆ",
    "ょ": "よ",
    "ゎ": "わ",
}
# ACE phonemes: load every phoneme plan from resources/all_plans.json (next to
# this module) and keep the first plan whose "language" is "zh".
# JSON is UTF-8 by specification, so the encoding is given explicitly instead
# of relying on the platform default (which is not UTF-8 on every system).
with open(
    Path(__file__).parent / "resources" / "all_plans.json", "r", encoding="utf-8"
) as f:
    ace_phonemes_all_plans = json.load(f)
# NOTE(review): if no plan has language "zh", ace_phonemes_zh_plan stays
# undefined and pinyin_to_phonemes_opencpop() fails with NameError at call time.
for plan in ace_phonemes_all_plans["plans"]:
    if plan["language"] == "zh":
        ace_phonemes_zh_plan = plan
        break
def preprocess_text(text: str, language: str) -> list[str]:
    """Strip spaces from *text* and split it into per-syllable units.

    Args:
        text: Raw input text; every ASCII space is removed before conversion.
        language: ``"mandarin"`` (handled by ``to_pinyin``) or ``"japanese"``
            (handled by ``to_kana``).

    Returns:
        The list produced by the language-specific converter.

    Raises:
        ValueError: If *language* is neither ``"mandarin"`` nor ``"japanese"``.
    """
    stripped = text.replace(" ", "")
    if language == "mandarin":
        return to_pinyin(stripped)
    if language == "japanese":
        return to_kana(stripped)
    raise ValueError("Other languages are not supported")
def to_pinyin(text: str) -> list[str]:
    """Convert *text* to a list of pinyin syllables and special markers.

    ``lazy_pinyin`` output is passed through unchanged, except for chunks whose
    first character is ``"S"``, ``"A"`` or ``"-"``: those are replaced by the
    ``"-"``, ``"AP"`` and ``"SP"`` markers found inside them, one list entry
    per marker (a chunk containing none of them contributes nothing).

    Args:
        text: Input text to romanize.

    Returns:
        Pinyin syllables and markers, in input order.
    """
    tokens = []
    for chunk in lazy_pinyin(text):
        if chunk[0] in ("S", "A", "-"):
            # Possibly several markers glued together, e.g. "SPAP".
            tokens.extend(re.findall(r"-|AP|SP", chunk))
            continue
        tokens.append(chunk)
    return tokens
# Hiragana grouped by the vowel kana written in place of a following "ー".
# Conventions kept from the original table: a long "o" is spelled with "う"
# (e.g. "こー" -> "こう"). Small kana (ぁ, ゃ, ...) are listed directly with
# the vowel of their full-size form, so no separate small-kana lookup is
# needed. Voiced / semi-voiced rows are included so that e.g. "がー" becomes
# "があ" rather than repeating the consonant kana.
_CHOUONPU_VOWEL_GROUPS = {
    "あ": "あかさたなはまやらわがざだばぱぁゃゎ",
    "い": "いきしちにひみりぎじぢびぴぃ",
    "う": "うくすつぬふむゆるぐずづぶぷゔぅゅ" "おこそとのほもよろをごぞどぼぽぉょ",
    "え": "えけせてねへめれげぜでべぺぇ",
}
_CHOUONPU_VOWELS = {
    kana: vowel
    for vowel, group in _CHOUONPU_VOWEL_GROUPS.items()
    for kana in group
}


def replace_chouonpu(hiragana_text: str) -> str:
    """Replace each long-vowel mark 「ー」 with an explicit vowel kana.

    The original note says this is needed because the packages used upstream
    did not handle 「ー」. Each mark is replaced by the vowel of the character
    that precedes it in the *output* (so consecutive marks keep extending the
    same vowel).

    Args:
        hiragana_text: Text in hiragana, possibly containing 「ー」.

    Returns:
        The text with every non-leading 「ー」 replaced. A leading 「ー」 has
        nothing to lengthen and is kept as is; after a character with no known
        vowel (e.g. "ん" or non-kana) the preceding character is repeated.
    """
    new_text = []
    for char in hiragana_text:
        if char == "ー" and new_text:
            prev_char = new_text[-1]
            new_text.append(_CHOUONPU_VOWELS.get(prev_char, prev_char))
        else:
            new_text.append(char)
    return "".join(new_text)
def to_kana(text: str) -> list[str]:
    """Convert *text* to hiragana and split it into per-mora units.

    Spaces are removed, the text is passed through
    ``kanji_to_kana.to_hiragana`` and ``replace_chouonpu``, and the result is
    split on spaces into subwords. Each subword is then cut into single kana,
    except that a kana followed by a small kana listed in ``yoon_map`` is kept
    together as one two-character unit.

    Args:
        text: Input Japanese text.

    Returns:
        The kana units in order. Empty subwords contribute nothing.
    """
    hiragana_text = kanji_to_kana.to_hiragana(text.replace(" ", ""))
    hiragana_text_wl = replace_chouonpu(hiragana_text).split(" ")
    final_ls = []
    for subword in hiragana_text_wl:
        pos = 0
        length = len(subword)
        # Bounds-checked scan: the previous index arithmetic raised IndexError
        # for an empty subword and for one ending in a kana + small-kana pair.
        while pos < length:
            if pos + 1 < length and subword[pos + 1] in yoon_map:
                final_ls.append(subword[pos : pos + 2])
                pos += 2
            else:
                final_ls.append(subword[pos])
                pos += 1
    return final_ls
def kana_to_phonemes_openjtalk(kana: str) -> list[str]:
    """Convert a kana string to phonemes with ``pyopenjtalk.g2p``.

    The characters of *kana* are joined with single spaces before being passed
    to ``g2p``. Warnings raised during the call are recorded, and any warning
    whose message contains ``"No phoneme"`` is turned into an error.

    Args:
        kana: Kana string to convert.

    Returns:
        The ``g2p`` output split on single spaces.

    Raises:
        ValueError: If a recorded warning message contains ``"No phoneme"``.
    """
    import pyopenjtalk

    with warnings.catch_warnings(record=True) as caught:
        # Record every warning, including ones that would normally be filtered.
        warnings.simplefilter("always")
        spaced_kana = " ".join(kana)
        # g2p returns one string with phonemes separated by spaces.
        phoneme_str = pyopenjtalk.g2p(spaced_kana, kana=False)

    for entry in caught:
        if "No phoneme" in str(entry.message):
            raise ValueError(f"No phoneme found for {spaced_kana}. {entry.message}")
    return phoneme_str.split(" ")
def pinyin_to_phonemes_opencpop(pinyin: str) -> list[str]:
    """Look up the phonemes for a pinyin syllable in the loaded "zh" plan.

    The syllable is lower-cased, then looked up in the plan's ``"dict"``; if
    absent there, it is resolved through ``"syllable_alias"`` and the aliased
    syllable is looked up in ``"dict"``.

    Args:
        pinyin: Pinyin syllable (case-insensitive).

    Returns:
        The entry stored in the plan's ``"dict"`` for the syllable or its alias.

    Raises:
        ValueError: If the syllable is neither in ``"dict"`` nor an alias.
    """
    syllable = pinyin.lower()
    phoneme_table = ace_phonemes_zh_plan["dict"]
    if syllable in phoneme_table:
        return phoneme_table[syllable]

    aliases = ace_phonemes_zh_plan["syllable_alias"]
    if syllable in aliases:
        canonical = aliases[syllable]
        return phoneme_table[canonical]

    raise ValueError(f"{syllable} not registered in Opencpop phoneme dict")
def pinyin_to_phonemes_ace(pinyin: str) -> list[str]:
    """Look up the phonemes for a pinyin syllable in ``PINYIN_DICT``.

    Args:
        pinyin: Pinyin syllable (case-insensitive; lower-cased before lookup).

    Returns:
        The ``PINYIN_DICT`` entry for the lower-cased syllable.

    Raises:
        ValueError: If the syllable is not a key of ``PINYIN_DICT``.
    """
    syllable = pinyin.lower()
    if syllable not in PINYIN_DICT:
        raise ValueError(f"{syllable} not registered in ACE phoneme dict")
    return PINYIN_DICT[syllable]