Spaces:
Sleeping
Sleeping
import json | |
import re | |
import warnings | |
from pathlib import Path | |
from kanjiconv import KanjiConv | |
from pypinyin import lazy_pinyin | |
from .resources.pinyin_dict import PINYIN_DICT | |
# Shared converter instance, created once at import time; to_kana() calls its
# to_hiragana() method.
kanji_to_kana = KanjiConv()
# Small kana mapped to their full-size counterparts.  to_kana() uses membership
# in this table to decide that a kana stays attached to the one before it.
# NOTE(review): the literals were mis-encoded in the extracted source (every
# key and value had collapsed to the same garbled character).  They are
# reconstructed here from the entry count and the surviving byte remnants;
# confirm against the upstream file.
yoon_map = {
    "ぁ": "あ",
    "ぃ": "い",
    "ぅ": "う",
    "ぇ": "え",
    "ぉ": "お",
    "ゃ": "や",
    "ゅ": "ゆ",
    "ょ": "よ",
    "ゎ": "わ",
}
# ACE_phonemes: load all phoneme plans from the bundled resource file and keep
# the one tagged "zh" for pinyin_to_phonemes_opencpop().
# The encoding is given explicitly because JSON text is UTF-8 while open()'s
# default depends on the platform locale.
with open(
    Path(__file__).parent / "resources" / "all_plans.json", "r", encoding="utf-8"
) as f:
    ace_phonemes_all_plans = json.load(f)

# Only the first plan whose language is "zh" is used.  If no plan matches,
# ace_phonemes_zh_plan is never bound and the lookup fails at call time.
for plan in ace_phonemes_all_plans["plans"]:
    if plan["language"] == "zh":
        ace_phonemes_zh_plan = plan
        break
def preprocess_text(text: str, language: str) -> list[str]:
    """Convert lyric text into a list of per-language text units.

    Args:
        text: Input text; ASCII spaces are removed before conversion.
        language: ``"mandarin"`` (handled by ``to_pinyin``) or ``"japanese"``
            (handled by ``to_kana``).

    Returns:
        The list returned by the language-specific converter.

    Raises:
        ValueError: If ``language`` is neither ``"mandarin"`` nor
            ``"japanese"``.
    """
    text = text.replace(" ", "")
    if language == "mandarin":
        return to_pinyin(text)
    if language == "japanese":
        return to_kana(text)
    raise ValueError("Other languages are not supported")
# Marker tokens that may appear inside a chunk returned by lazy_pinyin().
_SPECIAL_TOKEN_RE = re.compile(r"-|AP|SP")


def to_pinyin(text: str) -> list[str]:
    """Convert text to a flat list of pinyin syllables and marker tokens.

    Each chunk returned by ``lazy_pinyin`` is handled as follows:

    * A chunk starting with ``"S"``, ``"A"`` or ``"-"`` is scanned for the
      markers ``-``, ``AP`` and ``SP``; each match becomes its own list
      entry and any other characters in that chunk are dropped.
    * Any other chunk is kept unchanged.

    Args:
        text: Text to convert.

    Returns:
        The syllables and markers in their original order.
    """
    syllables = []
    for chunk in lazy_pinyin(text):
        # startswith() with a tuple is also safe for an empty chunk, where
        # indexing chunk[0] would raise IndexError.
        if chunk.startswith(("S", "A", "-")):
            syllables.extend(_SPECIAL_TOKEN_RE.findall(chunk))
        else:
            syllables.append(chunk)
    return syllables
# Hiragana grouped by the vowel they end in, keyed by the kana for that vowel.
# Voiced, semi-voiced and small kana are listed as well, so that "ー" after
# them is expanded to a vowel instead of repeating the kana itself.
# NOTE(review): the vowel table in the extracted source was mis-encoded and its
# values could not be recovered.  Each kana is mapped to its own vowel here; in
# particular the o-row maps to "お" and the e-row to "え" -- confirm against the
# upstream table, which may have used "う" / "い" instead.
_KANA_BY_VOWEL = {
    "あ": "あかがさざただなはばぱまやらわぁゃゎ",
    "い": "いきぎしじちぢにひびぴみりぃ",
    "う": "うくぐすずつづぬふぶぷむゆるゔぅゅ",
    "え": "えけげせぜてでねへべぺめれぇ",
    "お": "おこごそぞとどのほぼぽもよろをぉょ",
}
_VOWEL_OF_KANA = {
    kana: vowel for vowel, row in _KANA_BY_VOWEL.items() for kana in row
}


def replace_chouonpu(hiragana_text: str) -> str:
    """Replace each long-vowel mark "ー" with an explicit vowel kana.

    The mark is replaced by the vowel of the character emitted just before it,
    so consecutive marks keep repeating the same vowel.  A mark at the very
    start of the text is kept as is.  When the preceding character has no
    entry in the vowel table, that character is repeated.

    Args:
        hiragana_text: Text expected to be in hiragana.

    Returns:
        The text with every non-leading "ー" replaced.
    """
    out = []
    for char in hiragana_text:
        # `out` is non-empty exactly when this is not the first character.
        if char == "ー" and out:
            prev_char = out[-1]
            out.append(_VOWEL_OF_KANA.get(prev_char, prev_char))
        else:
            out.append(char)
    return "".join(out)
def _split_kana_units(word: str) -> list[str]:
    """Split one word into kana units, keeping a small kana with its predecessor."""
    units = []
    pos = 0
    while pos < len(word):
        # A kana followed by a small kana (a key of yoon_map) forms one unit.
        has_small_kana = pos + 1 < len(word) and word[pos + 1] in yoon_map
        width = 2 if has_small_kana else 1
        units.append(word[pos : pos + width])
        pos += width
    return units


def to_kana(text: str) -> list[str]:
    """Convert Japanese text to a flat list of hiragana units.

    The text is converted to hiragana with ``kanji_to_kana``, long-vowel marks
    are expanded by ``replace_chouonpu``, and the result is split on spaces.
    Each word is then cut into units of one kana, or of one kana plus a
    following small kana.

    Empty words and words ending in a small kana are handled without error;
    the earlier index-based loop read one position past the end of the word
    in those cases.

    Args:
        text: Japanese text; ASCII spaces are removed before conversion.

    Returns:
        The kana units of all words, in order.
    """
    hiragana_text = kanji_to_kana.to_hiragana(text.replace(" ", ""))
    final_ls = []
    for subword in replace_chouonpu(hiragana_text).split(" "):
        final_ls.extend(_split_kana_units(subword))
    return final_ls
def kana_to_phonemes_openjtalk(kana: str) -> list[str]:
    """Convert a kana string to phonemes with pyopenjtalk.

    Args:
        kana: Kana text; a space is inserted between its characters before it
            is passed to ``pyopenjtalk.g2p``.

    Returns:
        The space-separated output of ``g2p`` split into a list.

    Raises:
        ValueError: If a warning whose message contains ``"No phoneme"`` is
            emitted during conversion.
    """
    # Imported here, so pyopenjtalk is only needed when this function is called.
    import pyopenjtalk

    with warnings.catch_warnings(record=True) as caught:
        # Record every warning, including ones that would otherwise be
        # suppressed as repeats.
        warnings.simplefilter("always")
        # add space between each character
        spaced_kana = " ".join(kana)
        # phones is a str object separated by space
        phones = pyopenjtalk.g2p(spaced_kana, kana=False)

    for warning in caught:
        if "No phoneme" in str(warning.message):
            raise ValueError(
                f"No phoneme found for {spaced_kana}. {warning.message}"
            )
    return phones.split(" ")
def pinyin_to_phonemes_opencpop(pinyin: str) -> list[str]:
    """Look up the phonemes of a pinyin syllable in the loaded "zh" plan.

    The lower-cased syllable is looked up in the plan's ``"dict"`` table
    first.  If absent, it is resolved through ``"syllable_alias"`` and the
    alias target is looked up in ``"dict"``.

    Args:
        pinyin: Pinyin syllable; matching is case-insensitive.

    Returns:
        The phoneme entry stored for the syllable.

    Raises:
        ValueError: If the syllable is neither a dictionary key nor an alias.
        KeyError: If an alias points to a syllable missing from ``"dict"``.
    """
    pinyin = pinyin.lower()
    phoneme_table = ace_phonemes_zh_plan["dict"]
    if pinyin in phoneme_table:
        return phoneme_table[pinyin]

    aliases = ace_phonemes_zh_plan["syllable_alias"]
    if pinyin in aliases:
        return phoneme_table[aliases[pinyin]]

    raise ValueError(f"{pinyin} not registered in Opencpop phoneme dict")
def pinyin_to_phonemes_ace(pinyin: str) -> list[str]:
    """Look up the phonemes of a pinyin syllable in ``PINYIN_DICT``.

    Args:
        pinyin: Pinyin syllable; matching is case-insensitive.

    Returns:
        The entry stored in ``PINYIN_DICT`` for the lower-cased syllable.

    Raises:
        ValueError: If the syllable is not a key of ``PINYIN_DICT``.
    """
    pinyin = pinyin.lower()
    if pinyin not in PINYIN_DICT:
        raise ValueError(f"{pinyin} not registered in ACE phoneme dict")
    return PINYIN_DICT[pinyin]