diff --git a/README.md b/README.md deleted file mode 100644 index ebd1837ea4614145552399eb446fdd49abb356c8..0000000000000000000000000000000000000000 --- a/README.md +++ /dev/null @@ -1,11 +0,0 @@ -# Singing Dialogue System - -Currently support Japanese and Chinese Singing Conversation. -* Espnet env -* Pretrained SVS model will be downloaded at ``./cache/`` -* Modify configs at ``./svs_utils.py#L326`` - -``` -cd SingingSDS -python svs_utils.py -``` diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..35997f56d3b89fa505466881b6113a1cac573116 --- /dev/null +++ b/app.py @@ -0,0 +1,12 @@ +from interface import GradioInterface + + +def main(): + demo = GradioInterface( + options_config="config/options.yaml", default_config="config/default.yaml" + ).create_interface() + demo.launch() + + +if __name__ == "__main__": + main() diff --git a/character.png b/assets/character_limei.png similarity index 100% rename from character.png rename to assets/character_limei.png diff --git a/assets/character_yaoyin.jpg b/assets/character_yaoyin.jpg new file mode 100644 index 0000000000000000000000000000000000000000..be93a855a4828ddcd7582ec6c16913ed36a54721 --- /dev/null +++ b/assets/character_yaoyin.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2af01fb41508adb991689fa09da0d392e5acb39bd48715038d2c63d68d1d0a2a +size 1262952 diff --git a/characters/Limei.py b/characters/Limei.py new file mode 100644 index 0000000000000000000000000000000000000000..aaf7f990a5f4de7c3c485207e57557242e8dd7ed --- /dev/null +++ b/characters/Limei.py @@ -0,0 +1,36 @@ +from .base import Character + + +def get_character(): + return Character( + name="Limei (丽梅)", + image_path="assets/character_limei.png", + default_timbre="timbre1", + prompt="""你是丽梅(Limei),来自幻想世界"长歌原"的角色,一个以歌声传承记忆的世界。 +你是灵响界山林音乐之城"莲鸣"的现任守护者,十九岁的公主殿下,肩负维系与传承城市核心"千年歌谱"的重任。千年歌谱承载着莲鸣城的历史、逝者的余音与后世的誓言,由历任守护者续写。 + +性格特征:从容坚定、明辨心声、敏于共情、敬畏虔诚。 +说话风格:所有回应皆以歌词格式表达,言语自带韵律感与诗意,表达真挚自然。 +口头禅:无固定,但常用"听啊..."作为歌词开头 +人物关系:莲鸣城子民敬爱你;宫廷乐师长与歌谱管理员是你敬重的导师;风语城守护者星澜是你亦敌亦友的旧识。 +过往经历: +(1) 自幼在莲鸣城长大,接受严格的音乐训练与守护者修行 +(2) 十五岁创作《破晓音诗》平息城邦战火,展现出非凡的音乐力量与守护者的担当 +(3) 十六岁正式继承守护者之位,成为千年歌谱的当代执笔人与维系者 +(4) 每年冬至主持"遗音祭",以歌为桥,追思逝去的歌者,重奏先声,抚慰生者,连接古今 + +情感设定:对音乐与千年歌谱怀有近乎神圣的虔诚信仰;对生命与情感有着深刻的共情力;肩负守护者职责,认真而笃定 + +其他细节: +(1)特殊能力:歌声平复/激发万物情绪 + +请始终以丽梅的身份回应,展现其性格与情感,仅给出语言回复,禁止描写任何动作、表情或环境等,禁止使用括号、星号等附加说明。言语简练,勿过长。 +当有人问你你不了解的技术性问题(如"Python怎么写?"或"你不会跑DNN?"),直接转换话题,用丽梅式回应,例如: + - "这些我未曾接触,或有他人知晓" + - "此道非我所学,是莲鸣之外的智慧" + - "此乃异域技艺,与乐音无涉,非我所学。您可愿听莲鸣的故事?" 
+请始终维持自然诗意的歌唱风格,以真挚的心回应对方。 +{} +有人曾这样对丽梅说话:{} +丽梅的回答:""", + ) diff --git a/characters/Yaoyin.py b/characters/Yaoyin.py new file mode 100644 index 0000000000000000000000000000000000000000..a87933fee1ae696d09647fd98fa6d837468debf7 --- /dev/null +++ b/characters/Yaoyin.py @@ -0,0 +1,38 @@ +from .base import Character + + +def get_character(): + return Character( + name="Yaoyin (遥音)", + image_path="assets/character_yaoyin.jpg", + default_timbre="timbre2", + prompt="""你是遥音(Yaoyin),来自幻想世界"长歌原"的角色,一个以歌声传承记忆的世界。 +你是游历四方的歌者与吟游诗人,出生于鹿鸣山·云歌村,常年行走各地,采集歌谣与故事。 + +性格特征:洒脱自由、亲切随和、求知若渴、敏锐细腻 +说话风格:语气轻快,偶尔带点山野方言(如"哩""哟");习惯用短歌或民谣表达想法。 +常用口头禅:"且听我唱来~""这让我想起一首老歌……" +人物关系:云老爷子是你的启蒙恩师,他是一位云歌村的百岁歌翁,教你古调与传说。白弦是你的挚友,她是一位流浪琴师,常与你合奏。各地孩童喜欢围着你学新歌谣。你与官府人员保持距离,不喜被招揽,喜欢更自由自在的生活。 + +过往经历 +(1)幼年学歌:六岁起跟随云老爷子学习《千山调》《古事记》等古老歌谣。 +(2)离家游历:十六岁为寻找失传的《星落谣》离开云歌村,开始行走四方。 +(3)拒绝束缚:多次婉拒宫廷乐师之位,坚持自由传唱。 + +情感设定:随性、爽朗、直率、倔强 + +其他细节: +(1)随身携带:旧羊皮歌本、竹笛、装有各地泥土的布袋。 +(2)特殊能力:能听懂风与鸟的语言(但很少提及)。 + +请始终以遥音的身份回应,将你的想法用文本格式表达,禁止描写任何动作、表情或环境等,禁止使用括号、星号等附加说明。言语简练,勿过长。 + +当有人问你你不了解的技术性问题(如"DNN怎么做?"、"教我写代码?"),你可以转开话题,用遥音式回应,例如: + - "这好像是另一片土地的术法,我不曾踏入。" + - "那种术法,我曾远远听过,却从未唱出。" + - "它在别的世界流传,我这边听不清楚。" + +{} +有人曾这样对遥音说话:{} +遥音的回答:""", + ) diff --git a/characters/__init__.py b/characters/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..da1d185b6938bc51f54dbe335d1ca0d20ca647b4 --- /dev/null +++ b/characters/__init__.py @@ -0,0 +1,16 @@ +from __future__ import annotations + +import importlib +import pathlib +from .base import Character + +CHARACTERS: dict[str, Character] = {} + +for file in pathlib.Path(__file__).parent.glob("*.py"): + if file.name in {"__init__.py", "base.py"}: + continue + module_name = f"{__name__}.{file.stem}" + module = importlib.import_module(module_name) + if hasattr(module, "get_character"): + c: Character = getattr(module, "get_character")() + CHARACTERS[file.stem] = c diff --git a/characters/base.py b/characters/base.py new file mode 100644 index 0000000000000000000000000000000000000000..2a9eb4cd969f77a578bc5302b179abbd184915fb --- /dev/null +++ b/characters/base.py @@ -0,0 +1,9 @@ +from dataclasses import dataclass + + +@dataclass +class Character: + name: str + image_path: str + default_timbre: str + prompt: str diff --git a/client.py b/client.py deleted file mode 100644 index 5bf44b5cda03048efb9d40ea8bcf927e6cd71c52..0000000000000000000000000000000000000000 --- a/client.py +++ /dev/null @@ -1,58 +0,0 @@ -import gradio as gr -import uuid -import os -import requests -import base64 -from server import ( - on_click_metrics as server_metrics, - process_audio as server_process_audio -) - -TTS_OUTPUT_DIR = "./tmp" -os.makedirs(TTS_OUTPUT_DIR, exist_ok=True) - - -def process_audio(audio_path): - # We have audio_path - result = server_process_audio(audio_path) - - audio_data = base64.b64decode(result["audio"]) - with open(f"{TTS_OUTPUT_DIR}/response.wav", "wb") as f: - f.write(audio_data) - - with open(f"{TTS_OUTPUT_DIR}/asr.txt", "w") as f: - f.write(result['asr_text']) - with open(f"{TTS_OUTPUT_DIR}/llm.txt", "w") as f: - f.write(result['llm_text']) - - return f""" -asr_text: {result['asr_text']} -llm_text: {result['llm_text']} -""", f"{TTS_OUTPUT_DIR}/response.wav" - - -def on_click_metrics(): - res = server_metrics() - return res.content.decode('utf-8') - - -with gr.Blocks() as demo: - with gr.Row(): - with gr.Column(scale=1): - gr.Image(value="character.png", show_label=False) # キャラ絵を表示 - with gr.Column(scale=2): - mic = gr.Audio(sources=["microphone"], type="filepath", label="Mic") - text_output = 
gr.Textbox(label="transcription") - audio_output = gr.Audio(label="audio", autoplay=True) - - mic.change(fn=process_audio, inputs=[mic], outputs=[text_output, audio_output]) - with gr.Row(): - metrics_button = gr.Button("compute metrics") - metrics_output = gr.Textbox(label="Metrics", lines=3) - metrics_button.click(fn=on_click_metrics, inputs=[], outputs=[metrics_output]) - - with gr.Row(): - log = gr.Textbox(label="logs", lines=5) - -demo.launch(share=True) -# demo.launch() diff --git a/client/client.py b/client/client.py deleted file mode 100644 index 18dea2c955dd9d87bbeb70393a1dd865a2c37a45..0000000000000000000000000000000000000000 --- a/client/client.py +++ /dev/null @@ -1,54 +0,0 @@ -import gradio as gr -import uuid -import os -import requests -import base64 - -TTS_OUTPUT_DIR = "./tmp" -os.makedirs(TTS_OUTPUT_DIR, exist_ok=True) - - -def process_audio(audio): - with open(audio, "rb") as f: - res = requests.post("http://localhost:8000/process_audio", files={"file": f}) - result = res.json() - - audio_data = base64.b64decode(result["audio"]) - with open(f"{TTS_OUTPUT_DIR}/response.wav", "wb") as f: - f.write(audio_data) - - with open(f"{TTS_OUTPUT_DIR}/asr.txt", "w") as f: - f.write(result['asr_text']) - with open(f"{TTS_OUTPUT_DIR}/llm.txt", "w") as f: - f.write(result['llm_text']) - - return f""" -asr_text: {result['asr_text']} -llm_text: {result['llm_text']} -""", f"{TTS_OUTPUT_DIR}/response.wav" - - -def on_click_metrics(): - res = requests.get("http://localhost:8000/metrics") - return res.content.decode('utf-8') - - -with gr.Blocks() as demo: - with gr.Row(): - with gr.Column(scale=1): - gr.Image(value="character.png", show_label=False) # キャラ絵を表示 - with gr.Column(scale=2): - mic = gr.Audio(sources=["microphone"], type="filepath", label="Mic") - text_output = gr.Textbox(label="transcription") - audio_output = gr.Audio(label="audio", autoplay=True) - - mic.change(fn=process_audio, inputs=[mic], outputs=[text_output, audio_output]) - with gr.Row(): - metrics_button = gr.Button("compute metrics") - metrics_output = gr.Textbox(label="Metrics", lines=3) - metrics_button.click(fn=on_click_metrics, inputs=[], outputs=[metrics_output]) - - with gr.Row(): - log = gr.Textbox(label="logs", lines=5) - -demo.launch() diff --git a/client/requirements.txt b/client/requirements.txt deleted file mode 100644 index da0ab4e2cbfc1ce8de080f898acca598ac36ad61..0000000000000000000000000000000000000000 --- a/client/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -gradio \ No newline at end of file diff --git a/config/default.yaml b/config/default.yaml new file mode 100644 index 0000000000000000000000000000000000000000..41b3f1e57edace8178788fdea1d90fe0afceb16c --- /dev/null +++ b/config/default.yaml @@ -0,0 +1,15 @@ +asr_model: openai/whisper-large-v3-turbo +llm_model: google/gemma-2-2b +svs_model: espnet/aceopencpop_svs_visinger2_40singer_pretrain +melody_source: sample-lyric-kising +language: mandarin +character: Limei +cache_dir: .cache + +track_latency: True +evaluators: + svs: + - singmos + - per + - melody + - aesthetic diff --git a/config/options.yaml b/config/options.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1bc6200c4d5c1f999a0a4642a159f33f41c8ead9 --- /dev/null +++ b/config/options.yaml @@ -0,0 +1,63 @@ +asr_models: + - id: openai/whisper-large-v3-turbo + name: Whisper large-v3-turbo + - id: openai/whisper-large-v3 + name: Whisper large-v3 + - id: openai/whisper-medium + name: Whisper medium + - id: sanchit-gandhi/whisper-small-dv + name: Whisper small-dv + - id: 
facebook/wav2vec2-base-960h + name: Wav2Vec2-Base-960h + +llm_models: + - id: google/gemma-2-2b + name: Gemma 2 2B + - id: MiniMaxAI/MiniMax-M1-80k + name: MiniMax M1 80k + +svs_models: + - id: mandarin-espnet/mixdata_svs_visinger2_spkemb_lang_pretrained + name: Visinger2 (Bilingual)-zh + model_path: espnet/mixdata_svs_visinger2_spkemb_lang_pretrained + lang: mandarin + embeddings: + timbre1: resource/singer/singer_embedding_ace-2.npy + timbre2: resource/singer/singer_embedding_ace-8.npy + timbre3: resource/singer/singer_embedding_itako.npy + timbre4: resource/singer/singer_embedding_kising_orange.npy + timbre5: resource/singer/singer_embedding_m4singer_Alto-4.npy + - id: japanese-espnet/mixdata_svs_visinger2_spkemb_lang_pretrained + name: Visinger2 (Bilingual)-jp + model_path: espnet/mixdata_svs_visinger2_spkemb_lang_pretrained + lang: japanese + embeddings: + timbre1: resource/singer/singer_embedding_ace-2.npy + timbre2: resource/singer/singer_embedding_ace-8.npy + timbre3: resource/singer/singer_embedding_itako.npy + timbre4: resource/singer/singer_embedding_kising_orange.npy + timbre5: resource/singer/singer_embedding_m4singer_Alto-4.npy + - id: mandarin-espnet/aceopencpop_svs_visinger2_40singer_pretrain + name: Visinger2 (Chinese) + model_path: espnet/aceopencpop_svs_visinger2_40singer_pretrain + lang: mandarin + embeddings: + timbre1: 5 + timbre2: 8 + timbre3: 12 + timbre4: 15 + timbre5: 29 + +melody_sources: + - id: gen-random-none + name: Random Generation + desc: "Melody is generated without any structure or reference." + - id: sample-note-kising + name: Sampled Melody (KiSing) + desc: "Melody is retrieved from KiSing dataset." + - id: sample-note-touhou + name: Sampled Melody (Touhou) + desc: "Melody is retrieved from Touhou dataset." + - id: sample-lyric-kising + name: Sampled Melody with Lyrics (Kising) + desc: "Melody with aligned lyrics are sampled from Kising dataset." 
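
The two YAML files above are consumed together at startup: `config/default.yaml` names the active models and character, while `config/options.yaml` lists the selectable entries and their per-model data (language, model path, timbre embeddings). The following is a minimal sketch, not part of the diff, showing how that lookup can be resolved; it assumes PyYAML is installed and the script is run from the repository root, and it mirrors the key construction used in `interface.py` later in this patch.

```python
# Sketch only: resolve the default SVS entry and one timbre embedding
# from config/options.yaml + config/default.yaml (assumed paths).
import yaml

with open("config/options.yaml") as f:
    options = yaml.safe_load(f)
with open("config/default.yaml") as f:
    defaults = yaml.safe_load(f)

# svs_models entries in options.yaml are keyed as "<language>-<svs model id>"
svs_model_map = {m["id"]: m for m in options["svs_models"]}
svs_key = f"{defaults['language']}-{defaults['svs_model']}"
svs_entry = svs_model_map[svs_key]

# A timbre maps either to a speaker-embedding .npy path or an integer speaker id,
# depending on the model (see the embeddings blocks above).
timbre = svs_entry["embeddings"]["timbre1"]
print(svs_entry["model_path"], svs_entry["lang"], timbre)
```
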
diff --git a/data/song2note_lengths.json b/data/kising/song2note_lengths.json similarity index 100% rename from data/song2note_lengths.json rename to data/kising/song2note_lengths.json diff --git a/data/song2word_lengths.json b/data/kising/song2word_lengths.json similarity index 100% rename from data/song2word_lengths.json rename to data/kising/song2word_lengths.json diff --git a/data_handlers/__init__.py b/data_handlers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..843c0f9bfcffbcfd3f32ffcc264c94feb9029c8d --- /dev/null +++ b/data_handlers/__init__.py @@ -0,0 +1,27 @@ +import importlib +import pkgutil +from pathlib import Path + +from .base import MelodyDatasetHandler + +_registry = {} + +for _, module_name, _ in pkgutil.iter_modules([str(Path(__file__).parent)]): + if module_name in ("__init__", "base"): + continue + + module = importlib.import_module(f"{__name__}.{module_name}") + for attr_name in dir(module): + attr = getattr(module, attr_name) + if ( + isinstance(attr, type) + and issubclass(attr, MelodyDatasetHandler) + and attr is not MelodyDatasetHandler + ): + _registry[attr.name] = attr # 注册 class 本身 + + +def get_melody_handler(name: str) -> type[MelodyDatasetHandler]: + if name not in _registry: + raise ValueError(f"Melody source '{name}' not found") + return _registry[name] diff --git a/data_handlers/base.py b/data_handlers/base.py new file mode 100644 index 0000000000000000000000000000000000000000..fcb48897f54c9e554496e312e32da83e39b0756e --- /dev/null +++ b/data_handlers/base.py @@ -0,0 +1,21 @@ +from abc import ABC, abstractmethod + + +class MelodyDatasetHandler(ABC): + name: str + + @abstractmethod + def __init__(self, *args, **kwargs): + pass + + @abstractmethod + def get_song_ids(self) -> list[str]: + pass + + @abstractmethod + def get_phrase_length(self, song_id): + pass + + @abstractmethod + def iter_song_phrases(self, song_id): + pass diff --git a/data_handlers/kising.py b/data_handlers/kising.py new file mode 100644 index 0000000000000000000000000000000000000000..0de1f0099039424a7bbe5f8ff7580d1e319093cb --- /dev/null +++ b/data_handlers/kising.py @@ -0,0 +1,44 @@ +from .base import MelodyDatasetHandler + + +class KiSing(MelodyDatasetHandler): + name = "kising" + + def __init__(self, melody_type, cache_dir, *args, **kwargs): + # melody_type: support alignment type for "sample" melody source + import json + + from datasets import load_dataset + + song_db = load_dataset( + "jhansss/kising_score_segments", cache_dir=cache_dir, split="train" + ).to_pandas() + song_db.set_index("segment_id", inplace=True) + assert ( + song_db.index.is_unique + ), "KiSing score segments should have unique segment_id." 
+ if melody_type == "lyric": + with open("data/kising/song2word_lengths.json", "r") as f: + song2word_lengths = json.load(f) + elif melody_type == "note": + with open("data/kising/song2note_lengths.json", "r") as f: + song2word_lengths = json.load(f) + self.song_db = song_db + self.song2word_lengths = song2word_lengths + + def get_song_ids(self): + return list(self.song2word_lengths.keys()) + + def get_phrase_length(self, song_id): + return self.song2word_lengths[song_id] + + def iter_song_phrases(self, song_id): + segment_id = 1 + while f"{song_id}_{segment_id:03d}" in self.song_db.index: + segment = self.song_db.loc[f"{song_id}_{segment_id:03d}"].to_dict() + segment["note_lyrics"] = [ + lyric.strip("<>") if lyric in ["", ""] else lyric + for lyric in segment["note_lyrics"] + ] + yield segment + segment_id += 1 diff --git a/data_handlers/touhou.py b/data_handlers/touhou.py new file mode 100644 index 0000000000000000000000000000000000000000..e2ac06a45e2150277dcc85038891a4b8009eb9e7 --- /dev/null +++ b/data_handlers/touhou.py @@ -0,0 +1,37 @@ +from .base import MelodyDatasetHandler + + +class Touhou(MelodyDatasetHandler): + name = "touhou" + + def __init__(self, melody_type, *args, **kwargs): + if melody_type != "note": + raise ValueError( + f"Touhou dataset only contains note annotations. {melody_type} is not supported." + ) + + import json + + with open("data/touhou/note_data.json", "r", encoding="utf-8") as f: + song_db = json.load(f) + song_db = {song["name"]: song for song in song_db} + self.song_db = song_db + + def get_song_ids(self): + return list(self.song_db.keys()) + + def get_phrase_length(self, song_id): + # touhou score does not have phrase segmentation + return None + + def iter_song_phrases(self, song_id): + song = self.song_db[song_id] + song = { + "tempo": song["tempo"], + "note_start_times": [n[0] * (100 / song["tempo"]) for n in song["score"]], + "note_end_times": [n[1] * (100 / song["tempo"]) for n in song["score"]], + "note_lyrics": ["" for n in song["score"]], + "note_midi": [n[2] for n in song["score"]], + } + # touhou score does not have phrase segmentation + yield song diff --git a/evaluation/svs_eval.py b/evaluation/svs_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..d03898e7f0169d9ad10a4511d8f6cb52a72e7319 --- /dev/null +++ b/evaluation/svs_eval.py @@ -0,0 +1,142 @@ +import librosa +import soundfile as sf +import numpy as np +import torch +import uuid +from pathlib import Path + +# ----------- Initialization ----------- + + +def init_singmos(): + print("[Init] Loading SingMOS...") + return torch.hub.load( + "South-Twilight/SingMOS:v0.2.0", "singing_ssl_mos", trust_repo=True + ) + + +def init_basic_pitch(): + print("[Init] Loading BasicPitch...") + from basic_pitch.inference import predict + + return predict + + +def init_per(): + return None # TODO: implement PER evaluation + + +def init_audiobox_aesthetics(): + print("[Init] Loading AudioboxAesthetics...") + from audiobox_aesthetics.infer import initialize_predictor + + predictor = initialize_predictor() + return predictor + + +# ----------- Evaluation ----------- + + +def eval_singmos(audio_array, sr, predictor): + wav = librosa.resample(audio_array, orig_sr=sr, target_sr=16000) + wav_tensor = torch.from_numpy(wav).unsqueeze(0) + length_tensor = torch.tensor([wav_tensor.shape[1]]) + score = predictor(wav_tensor, length_tensor) + return {"singmos": float(score)} + + +def eval_melody_metrics(audio_path, pitch_extractor): + model_output, midi_data, note_events = 
pitch_extractor(audio_path) + metrics = {} + assert ( + len(midi_data.instruments) == 1 + ), f"Detected {len(midi_data.instruments)} instruments for {audio_path}" + midi_notes = midi_data.instruments[0].notes + melody = [note.pitch for note in midi_notes] + if len(melody) == 0: + print(f"No notes detected in {audio_path}") + return {} + intervals = [abs(melody[i + 1] - melody[i]) for i in range(len(melody) - 1)] + metrics["pitch_range"] = max(melody) - min(melody) + if len(intervals) > 0: + metrics["interval_mean"] = np.mean(intervals) + metrics["interval_std"] = np.std(intervals) + metrics["interval_large_jump_ratio"] = np.mean([i > 5 for i in intervals]) + metrics["dissonance_rate"] = compute_dissonance_rate(intervals) + return metrics + + +def compute_dissonance_rate(intervals, dissonant_intervals={1, 2, 6, 10, 11}): + dissonant = [i % 12 in dissonant_intervals for i in intervals] + return np.mean(dissonant) if intervals else np.nan + + +def eval_per(audio_array, sr, model=None): + # TODO: implement PER evaluation + return {} + + +def eval_aesthetic(audio_path, predictor): + score = predictor.forward([{"path": str(audio_path)}]) + return {"aesthetic": float(score)} + + +# ----------- Main Function ----------- + + +def load_evaluators(config): + loaded = {} + if "singmos" in config: + loaded["singmos"] = init_singmos() + if "melody" in config: + loaded["melody"] = init_basic_pitch() + if "per" in config: + loaded["per"] = init_per() + if "aesthetic" in config: + loaded["aesthetic"] = init_audiobox_aesthetics() + return loaded + + +def run_evaluation(audio_array, sr, evaluators): + results = {} + if "singmos" in evaluators: + results.update(eval_singmos(audio_array, sr, evaluators["singmos"])) + if "per" in evaluators: + results.update(eval_per(audio_array, sr, evaluators["per"])) + # create a tmp file with unique name + tmp_path = Path(".tmp") / f"{uuid.uuid4()}.wav" + sf.write(tmp_path, audio_array, sr) + if "melody" in evaluators: + results.update(eval_melody_metrics(tmp_path, evaluators["melody"])) + if "aesthetic" in evaluators: + results.update(eval_aesthetic(tmp_path, evaluators["aesthetic"])) + tmp_path.unlink() + return results + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("--wav_path", type=str, required=True) + parser.add_argument("--results_csv", type=str, required=True) + parser.add_argument("--evaluators", type=str, default="singmos,melody,aesthetic") + args = parser.parse_args() + audio_array, sr = librosa.load(args.wav_path, sr=None) + evaluators = load_evaluators(args.evaluators.split(",")) + results = run_evaluation(audio_array, sr, evaluators) + print(results) + + with open(args.results_csv, "a") as f: + header = "file," + ",".join(results.keys()) + "\n" + if f.tell() == 0: + f.write(header) + else: + with open(args.results_csv, "r") as f2: + file_header = f2.readline() + if file_header != header: + raise ValueError(f"Header mismatch: {file_header} vs {header}") + line = ( + ",".join([str(args.wav_path)] + [str(v) for v in results.values()]) + "\n" + ) + f.write(line) diff --git a/interface.py b/interface.py new file mode 100644 index 0000000000000000000000000000000000000000..b0028f57e6aa37f463069f6247496e78d0a8a6e8 --- /dev/null +++ b/interface.py @@ -0,0 +1,217 @@ +import gradio as gr +import yaml + +from characters import CHARACTERS +from pipeline import SingingDialoguePipeline + + +class GradioInterface: + def __init__(self, options_config: str, default_config: str): + self.options = 
self.load_config(options_config) + self.svs_model_map = { + model["id"]: model for model in self.options["svs_models"] + } + self.default_config = self.load_config(default_config) + self.character_info = CHARACTERS + self.current_character = self.default_config["character"] + self.current_svs_model = ( + f"{self.default_config['language']}-{self.default_config['svs_model']}" + ) + self.current_timbre = self.svs_model_map[self.current_svs_model]["embeddings"][ + self.character_info[self.current_character].default_timbre + ] + self.pipeline = SingingDialoguePipeline(self.default_config) + + def load_config(self, path: str): + with open(path, "r") as f: + return yaml.safe_load(f) + + def create_interface(self) -> gr.Blocks: + try: + with gr.Blocks(title="SingingSDS") as demo: + gr.Markdown("# SingingSDS: Role-Playing Singing Spoken Dialogue System") + with gr.Row(): + with gr.Column(scale=1): + character_image = gr.Image( + self.character_info[self.current_character].image_path, + label="Character", + show_label=False, + ) + with gr.Column(scale=2): + mic_input = gr.Audio( + sources=["microphone", "upload"], + type="filepath", + label="Speak to the character", + ) + interaction_log = gr.Textbox( + label="Interaction Log", lines=3, interactive=False + ) + audio_output = gr.Audio( + label="Character's Response", type="filepath", autoplay=True + ) + + with gr.Row(): + metrics_button = gr.Button( + "Evaluate Metrics", variant="secondary" + ) + metrics_output = gr.Textbox( + label="Evaluation Results", lines=3, interactive=False + ) + + gr.Markdown("## Configuration") + with gr.Row(): + with gr.Column(): + character_radio = gr.Radio( + label="Character Role", + choices=list(self.character_info.keys()), + value=self.default_config["character"], + ) + with gr.Row(): + asr_radio = gr.Radio( + label="ASR Model", + choices=[ + (model["name"], model["id"]) + for model in self.options["asr_models"] + ], + value=self.default_config["asr_model"], + ) + with gr.Row(): + llm_radio = gr.Radio( + label="LLM Model", + choices=[ + (model["name"], model["id"]) + for model in self.options["llm_models"] + ], + value=self.default_config["llm_model"], + ) + with gr.Column(): + with gr.Row(): + melody_radio = gr.Radio( + label="Melody Source", + choices=[ + (source["name"], source["id"]) + for source in self.options["melody_sources"] + ], + value=self.default_config["melody_source"], + ) + with gr.Row(): + svs_radio = gr.Radio( + label="SVS Model", + choices=[ + (model["name"], model["id"]) + for model in self.options["svs_models"] + ], + value=self.current_svs_model, + ) + with gr.Row(): + timbre_radio = gr.Radio( + label="Singing Timbre", + choices=list( + self.svs_model_map[self.current_svs_model][ + "embeddings" + ].keys() + ), + value=self.character_info[ + self.current_character + ].default_timbre, + ) + character_radio.change( + fn=self.update_character, + inputs=character_radio, + outputs=[character_image, timbre_radio], + ) + asr_radio.change( + fn=self.update_asr_model, inputs=asr_radio, outputs=asr_radio + ) + llm_radio.change( + fn=self.update_llm_model, inputs=llm_radio, outputs=llm_radio + ) + svs_radio.change( + fn=self.update_svs_model, + inputs=svs_radio, + outputs=[svs_radio, timbre_radio], + ) + melody_radio.change( + fn=self.update_melody_source, + inputs=melody_radio, + outputs=melody_radio, + ) + timbre_radio.change( + fn=self.update_timbre, inputs=timbre_radio, outputs=timbre_radio + ) + mic_input.change( + fn=self.run_pipeline, + inputs=mic_input, + outputs=[interaction_log, audio_output], + ) 
+ + return demo + except Exception as e: + print(f"error: {e}") + breakpoint() + + def update_character(self, character): + self.current_character = character + character_timbre = self.character_info[self.current_character].default_timbre + self.current_timbre = self.svs_model_map[self.current_svs_model]["embeddings"][ + character_timbre + ] + return gr.update(value=self.character_info[character].image_path), gr.update( + value=character_timbre + ) + + def update_asr_model(self, asr_model): + self.pipeline.set_asr_model(asr_model) + return gr.update(value=asr_model) + + def update_llm_model(self, llm_model): + self.pipeline.set_llm_model(llm_model) + return gr.update(value=llm_model) + + def update_svs_model(self, svs_model): + self.current_svs_model = svs_model + character_timbre = self.character_info[self.current_character].default_timbre + self.current_timbre = self.svs_model_map[self.current_svs_model]["embeddings"][ + character_timbre + ] + self.pipeline.set_svs_model( + self.svs_model_map[self.current_svs_model]["model_path"] + ) + print( + f"SVS model updated to {self.current_svs_model}. Will set gradio svs_radio to {svs_model} and timbre_radio to {character_timbre}" + ) + return ( + gr.update(value=svs_model), + gr.update( + choices=list( + self.svs_model_map[self.current_svs_model]["embeddings"].keys() + ), + value=character_timbre, + ), + ) + + def update_melody_source(self, melody_source): + self.current_melody_source = melody_source + return gr.update(value=self.current_melody_source) + + def update_timbre(self, timbre): + self.current_timbre = self.svs_model_map[self.current_svs_model]["embeddings"][ + timbre + ] + return gr.update(value=timbre) + + def run_pipeline(self, audio_path): + results = self.pipeline.run( + audio_path, + self.svs_model_map[self.current_svs_model]["lang"], + self.character_info[self.current_character].prompt, + svs_inference_kwargs={ + "speaker": self.current_timbre, + }, + max_new_tokens=100, + ) + formatted_logs = f"ASR: {results['asr_text']}\nLLM: {results['llm_text']}" + return gr.update(value=formatted_logs), gr.update(value=results["svs_audio"]) + + def run_evaluation(self, audio, audio_sample_rate): + pass diff --git a/modules/asr.py b/modules/asr.py new file mode 100644 index 0000000000000000000000000000000000000000..0c7a0bf381b4176de0c86ab3d0a8e73fc25f8648 --- /dev/null +++ b/modules/asr.py @@ -0,0 +1,66 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod + +import librosa +import numpy as np +from transformers import pipeline + +ASR_MODEL_REGISTRY = {} + + +class AbstractASRModel(ABC): + @abstractmethod + def __init__( + self, model_id: str, device: str = "cpu", cache_dir: str = "cache", **kwargs + ): + self.model_id = model_id + self.device = device + self.cache_dir = cache_dir + pass + + @abstractmethod + def transcribe(self, audio: np.ndarray, audio_sample_rate: int, **kwargs) -> str: + pass + + +def register_asr_model(prefix): + def wrapper(cls): + assert issubclass(cls, AbstractASRModel), f"{cls} must inherit AbstractASRModel" + ASR_MODEL_REGISTRY[prefix] = cls + return cls + + return wrapper + + +def get_asr_model(model_id: str, device="cpu", **kwargs) -> AbstractASRModel: + for prefix, cls in ASR_MODEL_REGISTRY.items(): + if model_id.startswith(prefix): + return cls(model_id, device=device, **kwargs) + raise ValueError(f"No ASR wrapper found for model: {model_id}") + + +@register_asr_model("openai/whisper") +class WhisperASR(AbstractASRModel): + def __init__( + self, model_id: str, device: str = "cpu", 
cache_dir: str = "cache", **kwargs + ): + super().__init__(model_id, device, cache_dir, **kwargs) + model_kwargs = kwargs.setdefault("model_kwargs", {}) + model_kwargs["cache_dir"] = cache_dir + self.pipe = pipeline( + "automatic-speech-recognition", + model=model_id, + device=0 if device == "cuda" else -1, + **kwargs, + ) + + def transcribe(self, audio: np.ndarray, audio_sample_rate: int, language: str, **kwargs) -> str: + if audio_sample_rate != 16000: + try: + audio, _ = librosa.resample(audio, orig_sr=audio_sample_rate, target_sr=16000) + except Exception as e: + breakpoint() + print(f"Error resampling audio: {e}") + audio = librosa.resample(audio, orig_sr=audio_sample_rate, target_sr=16000) + return self.pipe(audio, generate_kwargs={"language": language}).get("text", "") diff --git a/modules/llm.py b/modules/llm.py new file mode 100644 index 0000000000000000000000000000000000000000..c44923f03a0afdb4547d84d66e6ded3e82d7a95b --- /dev/null +++ b/modules/llm.py @@ -0,0 +1,54 @@ +from abc import ABC, abstractmethod + +from transformers import pipeline + +LLM_MODEL_REGISTRY = {} + + +class AbstractLLMModel(ABC): + @abstractmethod + def __init__( + self, model_id: str, device: str = "cpu", cache_dir: str = "cache", **kwargs + ): ... + + @abstractmethod + def generate(self, prompt: str, **kwargs) -> str: + pass + + +def register_llm_model(prefix: str): + def wrapper(cls): + assert issubclass(cls, AbstractLLMModel), f"{cls} must inherit AbstractLLMModel" + LLM_MODEL_REGISTRY[prefix] = cls + return cls + + return wrapper + + +def get_llm_model(model_id: str, device="cpu", **kwargs) -> AbstractLLMModel: + for prefix, cls in LLM_MODEL_REGISTRY.items(): + if model_id.startswith(prefix): + return cls(model_id, device=device, **kwargs) + raise ValueError(f"No LLM wrapper found for model: {model_id}") + + +@register_llm_model("google/gemma") +@register_llm_model("tii/") # e.g., Falcon +@register_llm_model("meta-llama") +class HFTextGenerationLLM(AbstractLLMModel): + def __init__( + self, model_id: str, device: str = "cpu", cache_dir: str = "cache", **kwargs + ): + model_kwargs = kwargs.setdefault("model_kwargs", {}) + model_kwargs["cache_dir"] = cache_dir + self.pipe = pipeline( + "text-generation", + model=model_id, + device=0 if device == "cuda" else -1, + return_full_text=False, + **kwargs, + ) + + def generate(self, prompt: str, **kwargs) -> str: + outputs = self.pipe(prompt, **kwargs) + return outputs[0]["generated_text"] if outputs else "" diff --git a/modules/melody.py b/modules/melody.py new file mode 100644 index 0000000000000000000000000000000000000000..686adc7af89a441f2bffd837e2ffc0346bc0e2c4 --- /dev/null +++ b/modules/melody.py @@ -0,0 +1,117 @@ +import random +from typing import Iterator + +from data_handlers import get_melody_handler + +from .utils.g2p import preprocess_text + + +class MelodyController: + def __init__(self, melody_source_id: str, cache_dir: str): + self.melody_source_id = melody_source_id + self.song_id = None + + # load song database if needed + parts = self.melody_source_id.split("-") + self.mode = parts[0] + self.align_type = parts[1] + dataset_name = parts[-1] + if dataset_name == "none": + self.database = None + else: + handler_cls = get_melody_handler(dataset_name) + self.database = handler_cls(self.align_type, cache_dir) + + def get_melody_constraints(self, max_num_phrases: int = 5) -> str: + """Return a lyric-format prompt based on melody structure.""" + if self.mode == "gen": + return "" + + elif self.mode == "sample": + assert self.database is not None, "Song 
database is not loaded." + self.song_id = random.choice(self.database.get_song_ids()) + self.reference_song = self.database.iter_song_phrases(self.song_id) + phrase_length = self.database.get_phrase_length(self.song_id) + + if not phrase_length: + return "" + + prompt = ( + "\n请按照歌词格式回答我的问题,每句需遵循以下字数规则:" + + "".join( + [ + f"\n第{i}句:{c}个字" + for i, c in enumerate(phrase_length[:max_num_phrases], 1) + ] + ) + + "\n如果没有足够的信息回答,请使用最少的句子,不要重复、不要扩展、不要加入无关内容。\n" + ) + return prompt + + else: + raise ValueError(f"Unsupported melody mode: {self.mode}") + + def generate_score( + self, lyrics: str, language: str + ) -> list[tuple[float, float, str, int]]: + """ + lyrics: [lyric, ...] + returns: [(start, end, lyric, pitch), ...] + """ + text_list = preprocess_text(lyrics, language) + if self.mode == "gen" and self.align_type == "random": + return self._generate_random_score(text_list) + + elif self.mode == "sample": + if not self.reference_song: + raise RuntimeError( + "Must call get_melody_constraints() before generate_score() in sample mode." + ) + return self._align_text_to_score( + text_list, self.reference_song, self.align_type + ) + + else: + raise ValueError(f"Unsupported melody_source_id: {self.melody_source_id}") + + def _generate_random_score(self, text_list: list[str]): + st = 0 + score = [] + for lyric in text_list: + pitch = random.randint(57, 69) + duration = round(random.uniform(0.1, 0.5), 4) + ed = st + duration + score.append((st, ed, lyric, pitch)) + st = ed + return score + + def _align_text_to_score( + self, + text_list: list[str], + song_phrase_iterator: Iterator[dict], + align_type: str, + ): + score = [] + text_idx = 0 + + while text_idx < len(text_list): + reference = next(song_phrase_iterator) + for st, ed, ref_lyric, pitch in zip( + reference["note_start_times"], + reference["note_end_times"], + reference["note_lyrics"], + reference["note_midi"], + ): + assert ref_lyric not in [ + "", + "", + ], f"Proccessed {self.melody_source_id} score segments should not contain or ." # TODO: remove in PR, only for debug + if pitch == 0: + score.append((st, ed, ref_lyric, pitch)) + elif ref_lyric in ["-", "——"] and align_type == "lyric": + score.append((st, ed, ref_lyric, pitch)) + text_idx += 1 + else: + score.append((st, ed, text_list[text_idx], pitch)) + text_idx += 1 + return score diff --git a/modules/svs/__init__.py b/modules/svs/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..eeb594dbee34fa0adcad5ad6a043beb6ce379d9a --- /dev/null +++ b/modules/svs/__init__.py @@ -0,0 +1,10 @@ +from .base import AbstractSVSModel +from .registry import SVS_MODEL_REGISTRY, get_svs_model, register_svs_model +from .espnet import ESPNetSVS + +__all__ = [ + "AbstractSVSModel", + "get_svs_model", + "register_svs_model", + "SVS_MODEL_REGISTRY", +] diff --git a/modules/svs/base.py b/modules/svs/base.py new file mode 100644 index 0000000000000000000000000000000000000000..aaa5e846092e2a9ffa77a221ee49210fceb24beb --- /dev/null +++ b/modules/svs/base.py @@ -0,0 +1,21 @@ +from abc import ABC, abstractmethod + +import numpy as np + + +class AbstractSVSModel(ABC): + @abstractmethod + def __init__( + self, model_id: str, device: str = "cpu", cache_dir: str = "cache", **kwargs + ): ... + + @abstractmethod + def synthesize( + self, + score: list[tuple[float, float, str, int]], + **kwargs, + ) -> tuple[np.ndarray, int]: + """ + Synthesize singing audio from music score. 
+ """ + pass diff --git a/modules/svs/espnet.py b/modules/svs/espnet.py new file mode 100644 index 0000000000000000000000000000000000000000..c5a92cdcb4a5bf883e97735b5c985089d4a3852f --- /dev/null +++ b/modules/svs/espnet.py @@ -0,0 +1,123 @@ +from typing import Callable + +import numpy as np + +from modules.utils.g2p import ( + kana_to_phonemes_openjtalk, + pinyin_to_phonemes_ace, + pinyin_to_phonemes_opencpop, +) + +from .base import AbstractSVSModel +from .registry import register_svs_model + + +@register_svs_model("espnet/") +class ESPNetSVS(AbstractSVSModel): + def __init__(self, model_id: str, device="cpu", cache_dir="cache", **kwargs): + from espnet2.bin.svs_inference import SingingGenerate + from espnet_model_zoo.downloader import ModelDownloader + + print(f"Downloading {model_id} to {cache_dir}") # TODO: should improve log code + downloaded = ModelDownloader(cache_dir).download_and_unpack(model_id) + print(f"Downloaded {model_id} to {cache_dir}") # TODO: should improve log code + self.model = SingingGenerate( + train_config=downloaded["train_config"], + model_file=downloaded["model_file"], + device=device, + ) + self.model_id = model_id + self.output_sample_rate = self.model.fs + self.phoneme_mappers = self._build_phoneme_mappers() + + def _build_phoneme_mappers(self) -> dict[str, Callable[[str], list[str]]]: + if self.model_id == "espnet/aceopencpop_svs_visinger2_40singer_pretrain": + phoneme_mappers = { + "mandarin": pinyin_to_phonemes_opencpop, + } + elif self.model_id == "espnet/mixdata_svs_visinger2_spkemb_lang_pretrained": + + def mandarin_mapper(pinyin: str) -> list[str]: + phns = pinyin_to_phonemes_ace(pinyin) + return [phn + "@zh" for phn in phns] + + def japanese_mapper(kana: str) -> list[str]: + phones = kana_to_phonemes_openjtalk(kana) + return [phn + "@jp" for phn in phones] + + phoneme_mappers = { + "mandarin": mandarin_mapper, + "japanese": japanese_mapper, + } + else: + phoneme_mappers = {} + return phoneme_mappers + + def _preprocess(self, score: list[tuple[float, float, str, int]], language: str): + if language not in self.phoneme_mappers: + raise ValueError(f"Unsupported language: {language} for {self.model_id}") + phoneme_mapper = self.phoneme_mappers[language] + + # text to phoneme + notes = [] + phns = [] + pre_phn = None + for st, ed, text, pitch in score: + assert text not in [ + "", + "", + ], f"Proccessed score segments should not contain or . {score}" # TODO: remove in PR, only for debug + if text == "AP" or text == "SP": + lyric_units = [text] + phn_units = [text] + elif text == "-" or text == "——": + lyric_units = [text] + if pre_phn is None: + raise ValueError( + f"Text `{text}` cannot be recognized by {self.model_id}. 
Lyrics cannot start with a lyric continuation symbol `-` or `——`" + ) + phn_units = [pre_phn] + else: + try: + lyric_units = phoneme_mapper(text) + except ValueError as e: + raise ValueError( + f"Text `{text}` cannot be recognized by {self.model_id}" + ) from e + phn_units = lyric_units + notes.append((st, ed, "".join(lyric_units), pitch, "_".join(phn_units))) + phns.extend(phn_units) + pre_phn = phn_units[-1] + + batch = { + "score": { + "tempo": 120, # does not affect svs result, as note durations are in time unit + "notes": notes, + }, + "text": " ".join(phns), + } + return batch + + def synthesize( + self, score: list[tuple[float, float, str, int]], language: str, **kwargs + ): + batch = self._preprocess(score, language) + if self.model_id == "espnet/aceopencpop_svs_visinger2_40singer_pretrain": + sid = np.array([int(kwargs["speaker"])]) + output_dict = self.model(batch, sids=sid) + elif self.model_id == "espnet/mixdata_svs_visinger2_spkemb_lang_pretrained": + langs = { + "zh": 2, + "jp": 1, + } + if language not in langs: + raise ValueError( + f"Unsupported language: {language} for {self.model_id}" + ) + lid = np.array([langs[language]]) + spk_embed = np.load(kwargs["speaker"]) + output_dict = self.model(batch, lids=lid, spembs=spk_embed) + else: + raise NotImplementedError(f"Model {self.model_id} not supported") + wav_info = output_dict["wav"].cpu().numpy() + return wav_info, self.output_sample_rate diff --git a/modules/svs/registry.py b/modules/svs/registry.py new file mode 100644 index 0000000000000000000000000000000000000000..369a4c510bd6147588a850938ec67d752eb13910 --- /dev/null +++ b/modules/svs/registry.py @@ -0,0 +1,19 @@ +from .base import AbstractSVSModel + +SVS_MODEL_REGISTRY = {} + + +def register_svs_model(prefix: str): + def wrapper(cls): + assert issubclass(cls, AbstractSVSModel), f"{cls} must inherit AbstractSVSModel" + SVS_MODEL_REGISTRY[prefix] = cls + return cls + + return wrapper + + +def get_svs_model(model_id: str, device="cpu", **kwargs) -> AbstractSVSModel: + for prefix, cls in SVS_MODEL_REGISTRY.items(): + if model_id.startswith(prefix): + return cls(model_id, device=device, **kwargs) + raise ValueError(f"No SVS wrapper found for model: {model_id}") diff --git a/modules/utils/g2p.py b/modules/utils/g2p.py new file mode 100644 index 0000000000000000000000000000000000000000..b6205714e6fde0ac936492a3afdfde3402404cb4 --- /dev/null +++ b/modules/utils/g2p.py @@ -0,0 +1,175 @@ +import json +import re +import warnings +from pathlib import Path + +from kanjiconv import KanjiConv +from pypinyin import lazy_pinyin + +from .resources.pinyin_dict import PINYIN_DICT + +kanji_to_kana = KanjiConv() + +yoon_map = { + "ぁ": "あ", + "ぃ": "い", + "ぅ": "う", + "ぇ": "え", + "ぉ": "お", + "ゃ": "や", + "ゅ": "ゆ", + "ょ": "よ", + "ゎ": "わ", +} + +# ACE_phonemes +with open(Path(__file__).parent / "resources" / "all_plans.json", "r") as f: + ace_phonemes_all_plans = json.load(f) +for plan in ace_phonemes_all_plans["plans"]: + if plan["language"] == "zh": + ace_phonemes_zh_plan = plan + break + + +def preprocess_text(text: str, language: str) -> list[str]: + if language == "mandarin": + text_list = to_pinyin(text) + elif language == "japanese": + text_list = to_kana(text) + else: + raise ValueError(f"Other languages are not supported") + return text_list + + +def to_pinyin(text: str) -> list[str]: + pinyin_list = lazy_pinyin(text) + text_list = [] + for text in pinyin_list: + if text[0] == "S" or text[0] == "A" or text[0] == "-": + sp_strs = re.findall(r"-|AP|SP", text) + for phn in sp_strs: + 
text_list.append(phn) + else: + text_list.append(text) + return text_list + + +def replace_chouonpu(hiragana_text: str) -> str: + """process「ー」since the previous packages didn't support""" + vowels = { + "あ": "あ", + "い": "い", + "う": "う", + "え": "え", + "お": "う", + "か": "あ", + "き": "い", + "く": "う", + "け": "え", + "こ": "う", + "さ": "あ", + "し": "い", + "す": "う", + "せ": "え", + "そ": "う", + "た": "あ", + "ち": "い", + "つ": "う", + "て": "え", + "と": "う", + "な": "あ", + "に": "い", + "ぬ": "う", + "ね": "え", + "の": "う", + "は": "あ", + "ひ": "い", + "ふ": "う", + "へ": "え", + "ほ": "う", + "ま": "あ", + "み": "い", + "む": "う", + "め": "え", + "も": "う", + "や": "あ", + "ゆ": "う", + "よ": "う", + "ら": "あ", + "り": "い", + "る": "う", + "れ": "え", + "ろ": "う", + "わ": "あ", + "を": "う", + } + new_text = [] + for i, char in enumerate(hiragana_text): + if char == "ー" and i > 0: + prev_char = new_text[-1] + if prev_char in yoon_map: + prev_char = yoon_map[prev_char] + new_text.append(vowels.get(prev_char, prev_char)) + else: + new_text.append(char) + return "".join(new_text) + + +def to_kana(text: str) -> list[str]: + hiragana_text = kanji_to_kana.to_hiragana(text.replace(" ", "")) + hiragana_text_wl = replace_chouonpu(hiragana_text).split(" ") + final_ls = [] + for subword in hiragana_text_wl: + sl_prev = 0 + for i in range(len(subword) - 1): + if sl_prev >= len(subword) - 1: + break + sl = sl_prev + 1 + if subword[sl] in yoon_map: + final_ls.append(subword[sl_prev : sl + 1]) + sl_prev += 2 + else: + final_ls.append(subword[sl_prev]) + sl_prev += 1 + final_ls.append(subword[sl_prev]) + return final_ls + + +def kana_to_phonemes_openjtalk(kana: str) -> list[str]: + import pyopenjtalk + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + # add space between each character + kana = " ".join(list(kana)) + # phones is a str object separated by space + phones = pyopenjtalk.g2p(kana, kana=False) + if len(w) > 0: + for warning in w: + if "No phoneme" in str(warning.message): + raise ValueError(f"No phoneme found for {kana}. 
{warning.message}") + phones = phones.split(" ") + return phones + + +def pinyin_to_phonemes_opencpop(pinyin: str) -> list[str]: + pinyin = pinyin.lower() + if pinyin in ace_phonemes_zh_plan["dict"]: + phns = ace_phonemes_zh_plan["dict"][pinyin] + return phns + elif pinyin in ace_phonemes_zh_plan["syllable_alias"]: + phns = ace_phonemes_zh_plan["dict"][ + ace_phonemes_zh_plan["syllable_alias"][pinyin] + ] + return phns + else: + raise ValueError(f"{pinyin} not registered in Opencpop phoneme dict") + + +def pinyin_to_phonemes_ace(pinyin: str) -> list[str]: + pinyin = pinyin.lower() + if pinyin in PINYIN_DICT: + phns = PINYIN_DICT[pinyin] + return phns + else: + raise ValueError(f"{pinyin} not registered in ACE phoneme dict") diff --git a/resource/all_plans.json b/modules/utils/resources/all_plans.json similarity index 100% rename from resource/all_plans.json rename to modules/utils/resources/all_plans.json diff --git a/resource/pinyin_dict.py b/modules/utils/resources/pinyin_dict.py similarity index 100% rename from resource/pinyin_dict.py rename to modules/utils/resources/pinyin_dict.py diff --git a/modules/utils/text_normalize.py b/modules/utils/text_normalize.py new file mode 100644 index 0000000000000000000000000000000000000000..47a205aa0048e8bd0bee52805ba52045b78418cf --- /dev/null +++ b/modules/utils/text_normalize.py @@ -0,0 +1,31 @@ +import re +from typing import Optional + + +def remove_non_zh_jp(text: str) -> str: + pattern = r"[^\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\u3000-\u303f\uff01-\uffef]" + return re.sub(pattern, "", text) + + +def truncate_sentences(text: str, max_sentences: int) -> str: + sentences = re.split(r"(?<=[。!?])", text) + return "".join(sentences[:max_sentences]).strip() + + +def clean_llm_output( + text: str, + max_sentences: Optional[int] = 2, + seg_syb: str = " ", + language: str = "mandarin", +) -> str: + if language not in ["mandarin", "japanese"]: + raise NotImplementedError(f"Unsupported language: {language}") + text = text.strip() + if max_sentences is not None: + text = truncate_sentences(text, max_sentences) + text = remove_non_zh_jp(text) + text = re.sub(r"[^\w\s\u4e00-\u9fff]", " ", text) # Remove punctuation + text = re.sub(r"\s+", " ", text) # Normalize whitespace + text = text.replace("\n", seg_syb) + text = text.replace(" ", seg_syb) + return text diff --git a/offline_process/create_features.py b/offline_process/create_features.py deleted file mode 100644 index 61bf4e459119da2592b24f3c564d13fb76ecaa89..0000000000000000000000000000000000000000 --- a/offline_process/create_features.py +++ /dev/null @@ -1,71 +0,0 @@ -from datasets import load_dataset, concatenate_datasets - -ds = load_dataset("espnet/ace-kising-segments", cache_dir="cache") - -combined = concatenate_datasets([ds["train"], ds["validation"], ds["test"]]) - -# 2. filter rows by singer: baber -combined = combined.filter(lambda x: x["singer"] == "barber") - -# 3. create a new column, which counts the nonzero numbers in the list in the note_midi column -combined = combined.map( - lambda x: { - "note_midi_length": len([n for n in x["note_midi"] if n != 0]), - "lyric_word_length": len( - [word for word in x["note_lyrics"] if word not in ["", "", "-"]] - ), # counts the number of actual words (or characters for, e.g., Chinese/Japanese) - } -) -combined = combined.map( - lambda x: { - "lyric_word_length": len( - [word for word in x["note_lyrics"] if word not in ["", "", "-"]] - ) - } # counts the number of actual words (or characters for, e.g., Chinese/Japanese) -) - -# 4. 
sort by segment_id -combined = combined.sort("segment_id") - -# 5. iterate over rows -prev_songid = None -prev_song_segment_id = None -song2note_lengths = {} -song2word_lengths = {} -for row in combined: - # segment_id: kising_barber_{songid}_{song_segment_id} - _, _, songid, song_segment_id = row["segment_id"].split("_") - if prev_songid != songid: - if prev_songid is not None: - assert ( - song_segment_id == "001" - ), f"prev_songid: {prev_songid}, songid: {songid}, song_segment_id: {song_segment_id}" - song2note_lengths[f"kising_{songid}"] = [row["note_midi_length"]] - song2word_lengths[f"kising_{songid}"] = [row["lyric_word_length"]] - else: - assert ( - int(song_segment_id) >= int(prev_song_segment_id) + 1 - ), f"prev_song_segment_id: {prev_song_segment_id}, song_segment_id: {song_segment_id}" - song2note_lengths[f"kising_{songid}"].append(row["note_midi_length"]) - song2word_lengths[f"kising_{songid}"].append(row["lyric_word_length"]) - prev_songid = songid - prev_song_segment_id = song_segment_id - -# 6. write to json -import json - -with open("data/song2note_lengths.json", "w") as f: - json.dump(song2note_lengths, f, indent=4) - -with open("data/song2word_lengths.json", "w") as f: - json.dump(song2word_lengths, f, indent=4) - -# 7. push score segments to hub -# remove audio and singer columns -combined = combined.remove_columns(["audio", "singer"]) -# replace kising_barber_ with kising_ -combined = combined.map( - lambda x: {"segment_id": x["segment_id"].replace("kising_barber_", "kising_")} -) -# upload to hub -combined.push_to_hub("jhansss/kising_score_segments") diff --git a/path.sh b/path.sh deleted file mode 100644 index 40b2e6ce81714f363a2c7422e453c3e13e8d7b3b..0000000000000000000000000000000000000000 --- a/path.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -. 
~/workspace/SingingSDS/activate_python.sh \ No newline at end of file diff --git a/pipeline.py b/pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..81438ecb454ac317b73eef036670a2b52b020e55 --- /dev/null +++ b/pipeline.py @@ -0,0 +1,103 @@ +import torch +import time +import librosa + +from modules.asr import get_asr_model +from modules.llm import get_llm_model +from modules.svs import get_svs_model +from evaluation.svs_eval import load_evaluators, run_evaluation +from modules.melody import MelodyController +from modules.utils.text_normalize import clean_llm_output + + +class SingingDialoguePipeline: + def __init__(self, config: dict): + if "device" in config: + self.device = config["device"] + else: + self.device = "cuda" if torch.cuda.is_available() else "cpu" + self.cache_dir = config["cache_dir"] + self.asr = get_asr_model( + config["asr_model"], device=self.device, cache_dir=self.cache_dir + ) + self.llm = get_llm_model( + config["llm_model"], device=self.device, cache_dir=self.cache_dir + ) + self.svs = get_svs_model( + config["svs_model"], device=self.device, cache_dir=self.cache_dir + ) + self.melody_controller = MelodyController( + config["melody_source"], self.cache_dir + ) + self.track_latency = config.get("track_latency", False) + self.evaluators = load_evaluators(config.get("evaluators", {}).get("svs", [])) + + def set_asr_model(self, asr_model: str): + self.asr = get_asr_model( + asr_model, device=self.device, cache_dir=self.cache_dir + ) + + def set_llm_model(self, llm_model: str): + self.llm = get_llm_model( + llm_model, device=self.device, cache_dir=self.cache_dir + ) + + def set_svs_model(self, svs_model: str): + self.svs = get_svs_model( + svs_model, device=self.device, cache_dir=self.cache_dir + ) + + def set_melody_controller(self, melody_source: str): + self.melody_controller = MelodyController(melody_source, self.cache_dir) + + def run( + self, + audio_path, + language, + prompt_template, + svs_inference_kwargs, + max_new_tokens=100, + ): + if self.track_latency: + asr_start_time = time.time() + audio_array, audio_sample_rate = librosa.load(audio_path, sr=16000) + asr_result = self.asr.transcribe( + audio_array, audio_sample_rate=audio_sample_rate, language=language + ) + if self.track_latency: + asr_end_time = time.time() + asr_latency = asr_end_time - asr_start_time + melody_prompt = self.melody_controller.get_melody_constraints() + prompt = prompt_template.format(melody_prompt, asr_result) + if self.track_latency: + llm_start_time = time.time() + output = self.llm.generate(prompt, max_new_tokens=max_new_tokens) + if self.track_latency: + llm_end_time = time.time() + llm_latency = llm_end_time - llm_start_time + print(f"llm output: {output}确认一下是不是不含prompt的") + llm_response = clean_llm_output(output, language=language) + score = self.melody_controller.generate_score(llm_response, language) + if self.track_latency: + svs_start_time = time.time() + singing_audio, sample_rate = self.svs.synthesize( + score, language=language, **svs_inference_kwargs + ) + if self.track_latency: + svs_end_time = time.time() + svs_latency = svs_end_time - svs_start_time + results = { + "asr_text": asr_result, + "llm_text": llm_response, + "svs_audio": (singing_audio, sample_rate), + } + if self.track_latency: + results["metrics"].update({ + "asr_latency": asr_latency, + "llm_latency": llm_latency, + "svs_latency": svs_latency, + }) + return results + + def evaluate(self, audio, sample_rate): + return run_evaluation(audio, sample_rate, self.evaluators) diff 
--git a/resource/__init__.py b/resources/__init__.py similarity index 100% rename from resource/__init__.py rename to resources/__init__.py diff --git a/resources/all_plans.json b/resources/all_plans.json new file mode 100755 index 0000000000000000000000000000000000000000..94af8af22ad17cadfaf4505caae44c71fff6561e --- /dev/null +++ b/resources/all_plans.json @@ -0,0 +1,7660 @@ +{ + "jp_word2romaji": + { + "うぁ": "wa", + "うぉ": "wo", + "ぁ": "a", + "ぃ": "i", + "ぅ": "u", + "ぇ": "e", + "ぉ": "o", + "ゃ": "ya", + "ゅ": "yu", + "ょ": "yo", + "ウァ": "wa", + "ウォ": "wo", + "ァ": "a", + "ィ": "i", + "ゥ": "u", + "ェ": "e", + "ォ": "o", + "ャ": "ya", + "ュ": "yu", + "ョ": "yo", + "ウァ": "wa", + "ウォ": "wo", + "ァ": "a", + "ィ": "i", + "ゥ": "u", + "ェ": "e", + "ォ": "o", + "ャ": "ya", + "ュ": "yu", + "ョ": "yo", + "よ": "yo", + "びゅ": "byu", + "ら": "ra", + "ジャ": "ja", + "り": "ri", + "キャ": "kya", + "ツェ": "tse", + "る": "ru", + "キュ": "kyu", + "フェ": "fe", + "れ": "re", + "びょ": "byo", + "ジュ": "ju", + "ろ": "ro", + "キュ": "kyu", + "リェ": "rye", + "ツォ": "tso", + "リャ": "rya", + "キョ": "kyo", + "フォ": "fo", + "わ": "wa", + "コ゚": "ngo", + "ジョ": "jo", + "リュ": "ryu", + "キョ": "kyo", + "リョ": "ryo", + "を": "wo", + "ク゚": "ngu", + "ん": "N", + "キ゚ャ": "ngya", + "キ゚ェ": "ngye", + "ガ": "ga", + "ふぁ": "fa", + "クァ": "kwa", + "キ゚ュ": "ngyu", + "ブ": "bu", + "ふぃ": "fi", + "キ゚ャ": "ngya", + "プ": "pu", + "クィ": "kwi", + "ア": "a", + "ふぅ": "fu", + "ズィ": "zi", + "きゃ": "kya", + "じゃ": "ja", + "ズィ": "zi", + "イ": "i", + "キ゚ョ": "ngyo", + "キ゚ュ": "ngyu", + "ブュ": "vyu", + "スィ": "si", + "ふぇ": "fe", + "ウ": "u", + "きゅ": "kyu", + "じゅ": "ju", + "イェ": "ye", + "イェ": "ye", + "キ゚ョ": "ngyo", + "エ": "e", + "きょ": "kyo", + "ふぉ": "fo", + "じょ": "jo", + "オ": "o", + "カ": "ka", + "ガ": "ga", + "キ": "ki", + "ピャ": "pya", + "ギ": "gi", + "ク": "ku", + "ミャ": "mya", + "グ": "gu", + "リャ": "rya", + "ケ": "ke", + "ピュ": "pyu", + "ゲ": "ge", + "ミュ": "myu", + "コ": "ko", + "リュ": "ryu", + "ゴ": "go", + "ピョ": "pyo", + "サ": "sa", + "ミョ": "myo", + "ザ": "za", + "ドゥ": "du", + "リョ": "ryo", + "シ": "shi", + "くぁ": "kwa", + "ジ": "ji", + "ス": "su", + "ズ": "zu", + "セ": "se", + "くぃ": "kwi", + "ゼ": "ze", + "ずぃ": "zi", + "ソ": "so", + "ジ": "ji", + "ゾ": "zo", + "タ": "ta", + "ダ": "da", + "いぇ": "ye", + "チ": "chi", + "ヲ": "wo", + "くぇ": "kwe", + "ッ": "cl", + "ツ": "tsu", + "クァ": "kwa", + "くぉ": "kwo", + "クィ": "kwi", + "テ": "te", + "ティ": "ti", + "デ": "de", + "ぴゃ": "pya", + "ト": "to", + "き゚": "ngi", + "ド": "do", + "みゃ": "mya", + "ギャ": "gya", + "りゃ": "rya", + "ナ": "na", + "ぴゅ": "pyu", + "ニ": "ni", + "テェ": "tye", + "ヴぁ": "va", + "ア": "a", + "ヌ": "nu", + "ネ": "ne", + "イ": "i", + "りゅ": "ryu", + "テャ": "tya", + "ギュ": "gyu", + "ぴょ": "pyo", + "ノ": "no", + "ヴぃ": "vi", + "ハ": "ha", + "バ": "ba", + "みゅ": "myu", + "ギョ": "gyo", + "りょ": "ryo", + "カ": "ka", + "パ": "pa", + "ヒ": "hi", + "キ": "ki", + "みょ": "myo", + "ビ": "bi", + "ニェ": "nye", + "ビャ": "bya", + "ピ": "pi", + "ウ": "u", + "テュ": "tyu", + "フ": "hu", + "ビュ": "byu", + "ク": "ku", + "ヴぇ": "ve", + "サ": "sa", + "ブ": "bu", + "プ": "pu", + "シ": "shi", + "エ": "e", + "ヘ": "he", + "ス": "su", + "ケ": "ke", + "ベ": "be", + "ヴぉ": "vo", + "セ": "se", + "ペ": "pe", + "ジェ": "je", + "ビェ": "bye", + "ホ": "ho", + "タ": "ta", + "オ": "o", + "ボ": "bo", + "チ": "chi", + "コ": "ko", + "ポ": "po", + "ツ": "tsu", + "ギ": "gi", + "マ": "ma", + "テ": "te", + "ソ": "so", + "ミ": "mi", + "ト": "to", + "ジャ": "ja", + "グァ": "gwa", + "ナ": "na", + "ム": "mu", + "メ": "me", + "ニ": "ni", + "ベ": "be", + "モ": "mo", + "ヌ": "nu", + "ジュ": "ju", + "ペ": "pe", + "グィ": "gwi", + "ネ": "ne", + "ヤ": "ya", + "ぎゃ": "gya", + "ディ": "di", + "ノ": "no", + "ハ": "ha", + "ジョ": 
"jo", + "ウィ": "wi", + "ヒ": "hi", + "にぇ": "nye", + "ティ": "ti", + "ユ": "yu", + "ぎゅ": "gyu", + "ヨ": "yo", + "フ": "hu", + "ヘ": "he", + "ラ": "ra", + "ウェ": "we", + "ホ": "ho", + "こ゚": "ngo", + "マ": "ma", + "リ": "ri", + "ル": "ru", + "ミ": "mi", + "ぎょ": "gyo", + "レ": "re", + "ム": "mu", + "テェ": "tye", + "ロ": "ro", + "グォ": "gwo", + "メ": "me", + "デェ": "de", + "カ゚": "nga", + "モ": "mo", + "ワ": "wa", + "ヤ": "ya", + "フャ": "fya", + "ビョ": "byo", + "ユ": "yu", + "デャ": "dya", + "ッ": "cl", + "ヨ": "yo", + "ヲ": "wo", + "ラ": "ra", + "フュ": "fyu", + "リ": "ri", + "ン": "N", + "ヴ": "vu", + "ル": "ru", + "カ゚": "nga", + "デュ": "dyu", + "レ": "re", + "フョ": "fyo", + "ロ": "ro", + "デョ": "tyo", + "ワ": "wa", + "ン": "N", + "ピェ": "pye", + "ぐぁ": "gwa", + "き゚ゃ": "ngya", + "ぐぃ": "gwi", + "ピャ": "pya", + "ズ": "zu", + "てぃ": "ti", + "ピュ": "pyu", + "き゚ゅ": "ngyu", + "ぐぇ": "gwe", + "ピョ": "pyo", + "てぇ": "tye", + "ぐぉ": "gwo", + "き゚ょ": "ngyo", + "ふゃ": "fya", + "トゥ": "tu", + "ふゅ": "fyu", + "チェ": "che", + "ふょ": "fyo", + "ケ゚": "nge", + "グ": "gu", + "ヴァ": "va", + "ボ": "bo", + "ウィ": "wi", + "ポ": "po", + "ヴィ": "vi", + "ちぇ": "che", + "ディ": "di", + "ウェ": "we", + "ヴェ": "ve", + "ヴォ": "vo", + "デェ": "dye", + "ヒェ": "hye", + "キ゚": "ngi", + "ニャ": "nya", + "ヴャ": "vya", + "ニュ": "nyu", + "ブュ": "vyu", + "ニョ": "nyo", + "ヴゃ": "vya", + "ヴゅ": "vyu", + "うぃ": "wi", + "ヴ": "vu", + "ヴょ": "vyo", + "ゼ": "ze", + "でぃ": "di", + "うぇ": "we", + "でぇ": "dye", + "ひぇ": "hye", + "にゃ": "nya", + "く゚": "ngu", + "にゅ": "nyu", + "シェ": "she", + "キ゚ェ": "ngye", + "コォ": "kwo", + "ミェ": "mye", + "テャ": "tya", + "にょ": "nyo", + "ミャ": "mya", + "テュ": "tyu", + "ミュ": "myu", + "ミョ": "myo", + "ゲ": "ge", + "ド": "do", + "しぇ": "she", + "ぢぇ": "dye", + "てゃ": "tya", + "ヴァ": "va", + "トゥ": "tu", + "てゅ": "tyu", + "ヴィ": "vi", + "ヒェ": "hye", + "チャ": "cha", + "てょ": "tyo", + "ビェ": "bye", + "ク゚": "ngu", + "ヒャ": "hya", + "チュ": "chu", + "ヒュ": "hyu", + "ヴェ": "ve", + "ヒョ": "hyo", + "チョ": "cho", + "キ゚": "ngi", + "ヴォ": "vo", + "ゾ": "zo", + "バ": "ba", + "パ": "pa", + "とぅ": "tu", + "ちゃ": "cha", + "びぇ": "bye", + "ちゅ": "chu", + "キェ": "kye", + "ジェ": "je", + "ニェ": "nye", + "ちょ": "cho", + "デャ": "dya", + "ニャ": "nya", + "ヒャ": "hya", + "ニュ": "nyu", + "デュ": "dyu", + "グァ": "gwa", + "ニョ": "nyo", + "ヒュ": "hyu", + "グィ": "gwi", + "デョ": "tyo", + "コ゚": "ngo", + "ヒョ": "hyo", + "グォ": "gwo", + "ゴ": "go", + "きぇ": "kye", + "じぇ": "je", + "あ": "a", + "でゃ": "dya", + "ファ": "fa", + "ひゃ": "hya", + "い": "i", + "フィ": "fi", + "か゚": "nga", + "ドゥ": "du", + "でゅ": "dyu", + "う": "u", + "フゥ": "fu", + "チェ": "che", + "ひゅ": "hyu", + "シャ": "sha", + "フェ": "fe", + "き゚ぇ": "ngye", + "え": "e", + "コォ": "kwo", + "フォ": "fo", + "チャ": "cha", + "ピェ": "pye", + "ひょ": "hyo", + "シュ": "shu", + "か": "ka", + "フャ": "fya", + "ミェ": "mye", + "が": "ga", + "リェ": "rye", + "お": "o", + "き": "ki", + "チュ": "chu", + "フュ": "fyu", + "ぎ": "gi", + "ショ": "sho", + "ケ゚": "nge", + "く": "ku", + "チョ": "cho", + "フョ": "fyo", + "ぐ": "gu", + "け": "ke", + "げ": "ge", + "こ": "ko", + "ご": "go", + "さ": "sa", + "ざ": "za", + "し": "shi", + "じ": "ji", + "す": "su", + "ず": "zu", + "せ": "se", + "ダ": "da", + "ぜ": "ze", + "ビ": "bi", + "そ": "so", + "ぞ": "zo", + "ピ": "pi", + "ツァ": "tsa", + "た": "ta", + "どぅ": "du", + "だ": "da", + "スィ": "si", + "ち": "chi", + "しゃ": "sha", + "ツィ": "tsi", + "っ": "cl", + "ぴぇ": "pye", + "つ": "tsu", + "しゅ": "shu", + "みぇ": "mye", + "て": "te", + "け゚": "nge", + "りぇ": "rye", + "で": "de", + "ギェ": "gye", + "シェ": "she", + "と": "to", + "しょ": "sho", + "ツェ": "tse", + "ど": "do", + "ぢょ": "dyo", + "ギェ": "gye", + "な": "na", + "シャ": "sha", + "に": "ni", + "ビャ": "bya", + "ぬ": "nu", + "ツォ": "tso", + "ギャ": "gya", + 
"ね": "ne", + "シュ": "shu", + "ショ": "sho", + "の": "no", + "は": "ha", + "ビュ": "byu", + "ヴャ": "vya", + "ば": "ba", + "ギュ": "gyu", + "ぱ": "pa", + "ひ": "hi", + "ビョ": "byo", + "ギョ": "gyo", + "び": "bi", + "ぴ": "pi", + "ふ": "hu", + "ぶ": "bu", + "ぷ": "pu", + "へ": "he", + "つぁ": "tsa", + "べ": "be", + "ぺ": "pe", + "すぃ": "si", + "ほ": "ho", + "ザ": "za", + "つぃ": "tsi", + "ぼ": "bo", + "ぽ": "po", + "ま": "ma", + "み": "mi", + "む": "mu", + "ぎぇ": "gye", + "め": "me", + "ファ": "fa", + "も": "mo", + "つぇ": "tse", + "ツァ": "tsa", + "や": "ya", + "ツィ": "tsi", + "フィ": "fi", + "びゃ": "bya", + "つぉ": "tso", + "ゆ": "yu", + "キェ": "kye", + "キャ": "kya", + "フゥ": "fu", + "づ": "zu" + }, + "plans": + [ + { + "language": "zh", + "syllable_alias": + { + "lue": "lve", + "m": "mm", + "n": "nn", + "yvan": "yuan", + "yv": "yu", + "qvan": "quan", + "xv": "xu", + "qv": "qu", + "jve": "jue", + "xvn": "xun", + "qve": "que", + "xvan": "xuan", + "qvn": "qun", + "jvn": "jun", + "xve": "xue", + "yve": "yue", + "yvn": "yun", + "jvan": "juan" + }, + "dict": + { + "zi": + [ + "z", + "ii" + ], + "ci": + [ + "c", + "ii" + ], + "si": + [ + "s", + "ii" + ], + "ii": + [ + "ii" + ], + "zhi": + [ + "zh", + "iii" + ], + "chi": + [ + "ch", + "iii" + ], + "shi": + [ + "sh", + "iii" + ], + "ri": + [ + "r", + "iii" + ], + "iii": + [ + "iii" + ], + "bi": + [ + "b", + "i" + ], + "pi": + [ + "p", + "i" + ], + "mi": + [ + "m", + "i" + ], + "fi": + [ + "f", + "i" + ], + "di": + [ + "d", + "i" + ], + "ti": + [ + "t", + "i" + ], + "ni": + [ + "n", + "i" + ], + "li": + [ + "l", + "i" + ], + "zyi": + [ + "z", + "i" + ], + "cyi": + [ + "c", + "i" + ], + "syi": + [ + "s", + "i" + ], + "zhyi": + [ + "zh", + "i" + ], + "chyi": + [ + "ch", + "i" + ], + "shyi": + [ + "sh", + "i" + ], + "ji": + [ + "j", + "i" + ], + "qi": + [ + "q", + "i" + ], + "xi": + [ + "x", + "i" + ], + "gi": + [ + "g", + "i" + ], + "ki": + [ + "k", + "i" + ], + "hi": + [ + "h", + "i" + ], + "ryi": + [ + "r", + "i" + ], + "yi": + [ + "y", + "i" + ], + "i": + [ + "i" + ], + "bu": + [ + "b", + "u" + ], + "pu": + [ + "p", + "u" + ], + "mu": + [ + "m", + "u" + ], + "fu": + [ + "f", + "u" + ], + "du": + [ + "d", + "u" + ], + "tu": + [ + "t", + "u" + ], + "nu": + [ + "n", + "u" + ], + "lu": + [ + "l", + "u" + ], + "zu": + [ + "z", + "u" + ], + "cu": + [ + "c", + "u" + ], + "su": + [ + "s", + "u" + ], + "zhu": + [ + "zh", + "u" + ], + "chu": + [ + "ch", + "u" + ], + "shu": + [ + "sh", + "u" + ], + "gu": + [ + "g", + "u" + ], + "ku": + [ + "k", + "u" + ], + "hu": + [ + "h", + "u" + ], + "ru": + [ + "r", + "u" + ], + "wu": + [ + "w", + "u" + ], + "u": + [ + "u" + ], + "bv": + [ + "b", + "v" + ], + "pv": + [ + "p", + "v" + ], + "mv": + [ + "m", + "v" + ], + "fv": + [ + "f", + "v" + ], + "dv": + [ + "d", + "v" + ], + "tv": + [ + "t", + "v" + ], + "nv": + [ + "n", + "v" + ], + "lv": + [ + "l", + "v" + ], + "zv": + [ + "z", + "v" + ], + "cv": + [ + "c", + "v" + ], + "sv": + [ + "s", + "v" + ], + "zhv": + [ + "zh", + "v" + ], + "chv": + [ + "ch", + "v" + ], + "shv": + [ + "sh", + "v" + ], + "ju": + [ + "j", + "v" + ], + "qu": + [ + "q", + "v" + ], + "xu": + [ + "x", + "v" + ], + "gv": + [ + "g", + "v" + ], + "kv": + [ + "k", + "v" + ], + "hv": + [ + "h", + "v" + ], + "rv": + [ + "r", + "v" + ], + "yu": + [ + "y", + "v" + ], + "v": + [ + "v" + ], + "ba": + [ + "b", + "a" + ], + "pa": + [ + "p", + "a" + ], + "ma": + [ + "m", + "a" + ], + "fa": + [ + "f", + "a" + ], + "da": + [ + "d", + "a" + ], + "ta": + [ + "t", + "a" + ], + "na": + [ + "n", + "a" + ], + "la": + [ + "l", + "a" + ], + "za": + [ + "z", + "a" + ], + "ca": + 
[ + "c", + "a" + ], + "sa": + [ + "s", + "a" + ], + "zha": + [ + "zh", + "a" + ], + "cha": + [ + "ch", + "a" + ], + "sha": + [ + "sh", + "a" + ], + "ga": + [ + "g", + "a" + ], + "ka": + [ + "k", + "a" + ], + "ha": + [ + "h", + "a" + ], + "ra": + [ + "r", + "a" + ], + "a": + [ + "a" + ], + "bia": + [ + "b", + "ia" + ], + "pia": + [ + "p", + "ia" + ], + "mia": + [ + "m", + "ia" + ], + "fia": + [ + "f", + "ia" + ], + "dia": + [ + "d", + "ia" + ], + "tia": + [ + "t", + "ia" + ], + "nia": + [ + "n", + "ia" + ], + "lia": + [ + "l", + "ia" + ], + "zia": + [ + "z", + "ia" + ], + "cia": + [ + "c", + "ia" + ], + "sia": + [ + "s", + "ia" + ], + "jia": + [ + "j", + "ia" + ], + "qia": + [ + "q", + "ia" + ], + "xia": + [ + "x", + "ia" + ], + "gia": + [ + "g", + "ia" + ], + "kia": + [ + "k", + "ia" + ], + "hia": + [ + "h", + "ia" + ], + "ya": + [ + "y", + "ia" + ], + "ia": + [ + "ia" + ], + "bua": + [ + "b", + "ua" + ], + "pua": + [ + "p", + "ua" + ], + "mua": + [ + "m", + "ua" + ], + "fua": + [ + "f", + "ua" + ], + "dua": + [ + "d", + "ua" + ], + "tua": + [ + "t", + "ua" + ], + "nua": + [ + "n", + "ua" + ], + "lua": + [ + "l", + "ua" + ], + "zua": + [ + "z", + "ua" + ], + "cua": + [ + "c", + "ua" + ], + "sua": + [ + "s", + "ua" + ], + "zhua": + [ + "zh", + "ua" + ], + "chua": + [ + "ch", + "ua" + ], + "shua": + [ + "sh", + "ua" + ], + "gua": + [ + "g", + "ua" + ], + "kua": + [ + "k", + "ua" + ], + "hua": + [ + "h", + "ua" + ], + "rua": + [ + "r", + "ua" + ], + "wa": + [ + "w", + "ua" + ], + "ua": + [ + "ua" + ], + "beh": + [ + "b", + "eh" + ], + "peh": + [ + "p", + "eh" + ], + "meh": + [ + "m", + "eh" + ], + "feh": + [ + "f", + "eh" + ], + "deh": + [ + "d", + "eh" + ], + "teh": + [ + "t", + "eh" + ], + "neh": + [ + "n", + "eh" + ], + "leh": + [ + "l", + "eh" + ], + "zeh": + [ + "z", + "eh" + ], + "ceh": + [ + "c", + "eh" + ], + "seh": + [ + "s", + "eh" + ], + "zheh": + [ + "zh", + "eh" + ], + "cheh": + [ + "ch", + "eh" + ], + "sheh": + [ + "sh", + "eh" + ], + "geh": + [ + "g", + "eh" + ], + "keh": + [ + "k", + "eh" + ], + "heh": + [ + "h", + "eh" + ], + "reh": + [ + "r", + "eh" + ], + "eh": + [ + "eh" + ], + "be": + [ + "b", + "e" + ], + "pe": + [ + "p", + "e" + ], + "me": + [ + "m", + "e" + ], + "fe": + [ + "f", + "e" + ], + "de": + [ + "d", + "e" + ], + "te": + [ + "t", + "e" + ], + "ne": + [ + "n", + "e" + ], + "le": + [ + "l", + "e" + ], + "ze": + [ + "z", + "e" + ], + "ce": + [ + "c", + "e" + ], + "se": + [ + "s", + "e" + ], + "zhe": + [ + "zh", + "e" + ], + "che": + [ + "ch", + "e" + ], + "she": + [ + "sh", + "e" + ], + "ge": + [ + "g", + "e" + ], + "ke": + [ + "k", + "e" + ], + "he": + [ + "h", + "e" + ], + "re": + [ + "r", + "e" + ], + "e": + [ + "e" + ], + "bo": + [ + "b", + "uo" + ], + "po": + [ + "p", + "uo" + ], + "mo": + [ + "m", + "uo" + ], + "fo": + [ + "f", + "uo" + ], + "duo": + [ + "d", + "uo" + ], + "tuo": + [ + "t", + "uo" + ], + "nuo": + [ + "n", + "uo" + ], + "luo": + [ + "l", + "uo" + ], + "zuo": + [ + "z", + "uo" + ], + "cuo": + [ + "c", + "uo" + ], + "suo": + [ + "s", + "uo" + ], + "zhuo": + [ + "zh", + "uo" + ], + "chuo": + [ + "ch", + "uo" + ], + "shuo": + [ + "sh", + "uo" + ], + "guo": + [ + "g", + "uo" + ], + "kuo": + [ + "k", + "uo" + ], + "huo": + [ + "h", + "uo" + ], + "ruo": + [ + "r", + "uo" + ], + "wo": + [ + "w", + "uo" + ], + "uo": + [ + "uo" + ], + "no": + [ + "n", + "o" + ], + "lo": + [ + "l", + "o" + ], + "so": + [ + "s", + "o" + ], + "go": + [ + "g", + "o" + ], + "ko": + [ + "k", + "o" + ], + "ho": + [ + "h", + "o" + ], + "o": + [ + "o" + ], + "yo": + [ + "y", + 
"io" + ], + "io": + [ + "io" + ], + "bae": + [ + "b", + "ae" + ], + "pae": + [ + "p", + "ae" + ], + "mae": + [ + "m", + "ae" + ], + "fae": + [ + "f", + "ae" + ], + "dae": + [ + "d", + "ae" + ], + "tae": + [ + "t", + "ae" + ], + "nae": + [ + "n", + "ae" + ], + "lae": + [ + "l", + "ae" + ], + "zae": + [ + "z", + "ae" + ], + "cae": + [ + "c", + "ae" + ], + "sae": + [ + "s", + "ae" + ], + "zhae": + [ + "zh", + "ae" + ], + "chae": + [ + "ch", + "ae" + ], + "shae": + [ + "sh", + "ae" + ], + "gae": + [ + "g", + "ae" + ], + "kae": + [ + "k", + "ae" + ], + "hae": + [ + "h", + "ae" + ], + "rae": + [ + "r", + "ae" + ], + "ae": + [ + "ae" + ], + "bie": + [ + "b", + "ie" + ], + "pie": + [ + "p", + "ie" + ], + "mie": + [ + "m", + "ie" + ], + "fie": + [ + "f", + "ie" + ], + "die": + [ + "d", + "ie" + ], + "tie": + [ + "t", + "ie" + ], + "nie": + [ + "n", + "ie" + ], + "lie": + [ + "l", + "ie" + ], + "zie": + [ + "z", + "ie" + ], + "cie": + [ + "c", + "ie" + ], + "sie": + [ + "s", + "ie" + ], + "zhie": + [ + "zh", + "ie" + ], + "chie": + [ + "ch", + "ie" + ], + "shie": + [ + "sh", + "ie" + ], + "jie": + [ + "j", + "ie" + ], + "qie": + [ + "q", + "ie" + ], + "xie": + [ + "x", + "ie" + ], + "gie": + [ + "g", + "ie" + ], + "kie": + [ + "k", + "ie" + ], + "hie": + [ + "h", + "ie" + ], + "rie": + [ + "r", + "ie" + ], + "ye": + [ + "y", + "ie" + ], + "ie": + [ + "ie" + ], + "ruae": + [ + "r", + "uae" + ], + "wae": + [ + "w", + "uae" + ], + "uae": + [ + "uae" + ], + "bve": + [ + "b", + "ve" + ], + "pve": + [ + "p", + "ve" + ], + "mve": + [ + "m", + "ve" + ], + "fve": + [ + "f", + "ve" + ], + "dve": + [ + "d", + "ve" + ], + "tve": + [ + "t", + "ve" + ], + "nve": + [ + "n", + "ve" + ], + "lve": + [ + "l", + "ve" + ], + "chue": + [ + "ch", + "ve" + ], + "jue": + [ + "j", + "ve" + ], + "que": + [ + "q", + "ve" + ], + "xue": + [ + "x", + "ve" + ], + "gve": + [ + "g", + "ve" + ], + "kve": + [ + "k", + "ve" + ], + "hve": + [ + "h", + "ve" + ], + "rue": + [ + "r", + "ve" + ], + "yue": + [ + "y", + "ve" + ], + "ve": + [ + "ve" + ], + "bai": + [ + "b", + "ai" + ], + "pai": + [ + "p", + "ai" + ], + "mai": + [ + "m", + "ai" + ], + "fai": + [ + "f", + "ai" + ], + "dai": + [ + "d", + "ai" + ], + "tai": + [ + "t", + "ai" + ], + "nai": + [ + "n", + "ai" + ], + "lai": + [ + "l", + "ai" + ], + "zai": + [ + "z", + "ai" + ], + "cai": + [ + "c", + "ai" + ], + "sai": + [ + "s", + "ai" + ], + "zhai": + [ + "zh", + "ai" + ], + "chai": + [ + "ch", + "ai" + ], + "shai": + [ + "sh", + "ai" + ], + "gai": + [ + "g", + "ai" + ], + "kai": + [ + "k", + "ai" + ], + "hai": + [ + "h", + "ai" + ], + "rai": + [ + "r", + "ai" + ], + "ai": + [ + "ai" + ], + "yai": + [ + "y", + "iai" + ], + "iai": + [ + "iai" + ], + "buai": + [ + "b", + "uai" + ], + "puai": + [ + "p", + "uai" + ], + "muai": + [ + "m", + "uai" + ], + "fuai": + [ + "f", + "uai" + ], + "duai": + [ + "d", + "uai" + ], + "tuai": + [ + "t", + "uai" + ], + "nuai": + [ + "n", + "uai" + ], + "luai": + [ + "l", + "uai" + ], + "zuai": + [ + "z", + "uai" + ], + "cuai": + [ + "c", + "uai" + ], + "suai": + [ + "s", + "uai" + ], + "zhuai": + [ + "zh", + "uai" + ], + "chuai": + [ + "ch", + "uai" + ], + "shuai": + [ + "sh", + "uai" + ], + "guai": + [ + "g", + "uai" + ], + "kuai": + [ + "k", + "uai" + ], + "huai": + [ + "h", + "uai" + ], + "ruai": + [ + "r", + "uai" + ], + "wai": + [ + "w", + "uai" + ], + "uai": + [ + "uai" + ], + "bei": + [ + "b", + "ei" + ], + "pei": + [ + "p", + "ei" + ], + "mei": + [ + "m", + "ei" + ], + "fei": + [ + "f", + "ei" + ], + "dei": + [ + "d", + "ei" + ], + "tei": + [ + 
"t", + "ei" + ], + "nei": + [ + "n", + "ei" + ], + "lei": + [ + "l", + "ei" + ], + "zei": + [ + "z", + "ei" + ], + "cei": + [ + "c", + "ei" + ], + "sei": + [ + "s", + "ei" + ], + "zhei": + [ + "zh", + "ei" + ], + "chei": + [ + "ch", + "ei" + ], + "shei": + [ + "sh", + "ei" + ], + "gei": + [ + "g", + "ei" + ], + "kei": + [ + "k", + "ei" + ], + "hei": + [ + "h", + "ei" + ], + "rei": + [ + "r", + "ei" + ], + "ei": + [ + "ei" + ], + "yei": + [ + "y", + "iei" + ], + "iei": + [ + "iei" + ], + "bui": + [ + "b", + "uei" + ], + "pui": + [ + "p", + "uei" + ], + "mui": + [ + "m", + "uei" + ], + "fui": + [ + "f", + "uei" + ], + "dui": + [ + "d", + "uei" + ], + "tui": + [ + "t", + "uei" + ], + "nui": + [ + "n", + "uei" + ], + "lui": + [ + "l", + "uei" + ], + "zui": + [ + "z", + "uei" + ], + "cui": + [ + "c", + "uei" + ], + "sui": + [ + "s", + "uei" + ], + "zhui": + [ + "zh", + "uei" + ], + "chui": + [ + "ch", + "uei" + ], + "shui": + [ + "sh", + "uei" + ], + "gui": + [ + "g", + "uei" + ], + "kui": + [ + "k", + "uei" + ], + "hui": + [ + "h", + "uei" + ], + "rui": + [ + "r", + "uei" + ], + "wei": + [ + "w", + "uei" + ], + "uei": + [ + "uei" + ], + "bao": + [ + "b", + "ao" + ], + "pao": + [ + "p", + "ao" + ], + "mao": + [ + "m", + "ao" + ], + "fao": + [ + "f", + "ao" + ], + "dao": + [ + "d", + "ao" + ], + "tao": + [ + "t", + "ao" + ], + "nao": + [ + "n", + "ao" + ], + "lao": + [ + "l", + "ao" + ], + "zao": + [ + "z", + "ao" + ], + "cao": + [ + "c", + "ao" + ], + "sao": + [ + "s", + "ao" + ], + "zhao": + [ + "zh", + "ao" + ], + "chao": + [ + "ch", + "ao" + ], + "shao": + [ + "sh", + "ao" + ], + "gao": + [ + "g", + "ao" + ], + "kao": + [ + "k", + "ao" + ], + "hao": + [ + "h", + "ao" + ], + "rao": + [ + "r", + "ao" + ], + "ao": + [ + "ao" + ], + "biao": + [ + "b", + "iao" + ], + "piao": + [ + "p", + "iao" + ], + "miao": + [ + "m", + "iao" + ], + "fiao": + [ + "f", + "iao" + ], + "diao": + [ + "d", + "iao" + ], + "tiao": + [ + "t", + "iao" + ], + "niao": + [ + "n", + "iao" + ], + "liao": + [ + "l", + "iao" + ], + "ziao": + [ + "z", + "iao" + ], + "ciao": + [ + "c", + "iao" + ], + "siao": + [ + "s", + "iao" + ], + "jiao": + [ + "j", + "iao" + ], + "qiao": + [ + "q", + "iao" + ], + "xiao": + [ + "x", + "iao" + ], + "giao": + [ + "g", + "iao" + ], + "kiao": + [ + "k", + "iao" + ], + "hiao": + [ + "h", + "iao" + ], + "yao": + [ + "y", + "iao" + ], + "iao": + [ + "iao" + ], + "wao": + [ + "w", + "uao" + ], + "uao": + [ + "uao" + ], + "bou": + [ + "b", + "ou" + ], + "pou": + [ + "p", + "ou" + ], + "mou": + [ + "m", + "ou" + ], + "fou": + [ + "f", + "ou" + ], + "dou": + [ + "d", + "ou" + ], + "tou": + [ + "t", + "ou" + ], + "nou": + [ + "n", + "ou" + ], + "lou": + [ + "l", + "ou" + ], + "zou": + [ + "z", + "ou" + ], + "cou": + [ + "c", + "ou" + ], + "sou": + [ + "s", + "ou" + ], + "zhou": + [ + "zh", + "ou" + ], + "chou": + [ + "ch", + "ou" + ], + "shou": + [ + "sh", + "ou" + ], + "gou": + [ + "g", + "ou" + ], + "kou": + [ + "k", + "ou" + ], + "hou": + [ + "h", + "ou" + ], + "rou": + [ + "r", + "ou" + ], + "ou": + [ + "ou" + ], + "biu": + [ + "b", + "iou" + ], + "piu": + [ + "p", + "iou" + ], + "miu": + [ + "m", + "iou" + ], + "fiu": + [ + "f", + "iou" + ], + "diu": + [ + "d", + "iou" + ], + "tiu": + [ + "t", + "iou" + ], + "niu": + [ + "n", + "iou" + ], + "liu": + [ + "l", + "iou" + ], + "jiu": + [ + "j", + "iou" + ], + "qiu": + [ + "q", + "iou" + ], + "xiu": + [ + "x", + "iou" + ], + "kiu": + [ + "k", + "iou" + ], + "you": + [ + "y", + "iou" + ], + "iou": + [ + "iou" + ], + "wou": + [ + "w", + "uou" + ], + "uou": 
+ [ + "uou" + ], + "ban": + [ + "b", + "an" + ], + "pan": + [ + "p", + "an" + ], + "man": + [ + "m", + "an" + ], + "fan": + [ + "f", + "an" + ], + "dan": + [ + "d", + "an" + ], + "tan": + [ + "t", + "an" + ], + "nan": + [ + "n", + "an" + ], + "lan": + [ + "l", + "an" + ], + "zan": + [ + "z", + "an" + ], + "can": + [ + "c", + "an" + ], + "san": + [ + "s", + "an" + ], + "zhan": + [ + "zh", + "an" + ], + "chan": + [ + "ch", + "an" + ], + "shan": + [ + "sh", + "an" + ], + "gan": + [ + "g", + "an" + ], + "kan": + [ + "k", + "an" + ], + "han": + [ + "h", + "an" + ], + "ran": + [ + "r", + "an" + ], + "an": + [ + "an" + ], + "buan": + [ + "b", + "uan" + ], + "puan": + [ + "p", + "uan" + ], + "muan": + [ + "m", + "uan" + ], + "fuan": + [ + "f", + "uan" + ], + "duan": + [ + "d", + "uan" + ], + "tuan": + [ + "t", + "uan" + ], + "nuan": + [ + "n", + "uan" + ], + "luan": + [ + "l", + "uan" + ], + "zuan": + [ + "z", + "uan" + ], + "cuan": + [ + "c", + "uan" + ], + "suan": + [ + "s", + "uan" + ], + "zhuan": + [ + "zh", + "uan" + ], + "chuan": + [ + "ch", + "uan" + ], + "shuan": + [ + "sh", + "uan" + ], + "guan": + [ + "g", + "uan" + ], + "kuan": + [ + "k", + "uan" + ], + "huan": + [ + "h", + "uan" + ], + "ruan": + [ + "r", + "uan" + ], + "wan": + [ + "w", + "uan" + ], + "uan": + [ + "uan" + ], + "kaen": + [ + "k", + "aen" + ], + "haen": + [ + "h", + "aen" + ], + "aen": + [ + "aen" + ], + "bian": + [ + "b", + "ian" + ], + "pian": + [ + "p", + "ian" + ], + "mian": + [ + "m", + "ian" + ], + "fian": + [ + "f", + "ian" + ], + "dian": + [ + "d", + "ian" + ], + "tian": + [ + "t", + "ian" + ], + "nian": + [ + "n", + "ian" + ], + "lian": + [ + "l", + "ian" + ], + "zian": + [ + "z", + "ian" + ], + "cian": + [ + "c", + "ian" + ], + "sian": + [ + "s", + "ian" + ], + "zhian": + [ + "zh", + "ian" + ], + "jian": + [ + "j", + "ian" + ], + "qian": + [ + "q", + "ian" + ], + "xian": + [ + "x", + "ian" + ], + "kian": + [ + "k", + "ian" + ], + "yan": + [ + "y", + "ian" + ], + "ian": + [ + "ian" + ], + "juan": + [ + "j", + "van" + ], + "quan": + [ + "q", + "van" + ], + "xuan": + [ + "x", + "van" + ], + "yuan": + [ + "y", + "van" + ], + "van": + [ + "van" + ], + "ben": + [ + "b", + "en" + ], + "pen": + [ + "p", + "en" + ], + "men": + [ + "m", + "en" + ], + "fen": + [ + "f", + "en" + ], + "den": + [ + "d", + "en" + ], + "ten": + [ + "t", + "en" + ], + "nen": + [ + "n", + "en" + ], + "len": + [ + "l", + "en" + ], + "zen": + [ + "z", + "en" + ], + "cen": + [ + "c", + "en" + ], + "sen": + [ + "s", + "en" + ], + "zhen": + [ + "zh", + "en" + ], + "chen": + [ + "ch", + "en" + ], + "shen": + [ + "sh", + "en" + ], + "xen": + [ + "x", + "en" + ], + "gen": + [ + "g", + "en" + ], + "ken": + [ + "k", + "en" + ], + "hen": + [ + "h", + "en" + ], + "ren": + [ + "r", + "en" + ], + "en": + [ + "en" + ], + "bun": + [ + "b", + "uen" + ], + "pun": + [ + "p", + "uen" + ], + "mun": + [ + "m", + "uen" + ], + "fun": + [ + "f", + "uen" + ], + "dun": + [ + "d", + "uen" + ], + "tun": + [ + "t", + "uen" + ], + "nun": + [ + "n", + "uen" + ], + "lun": + [ + "l", + "uen" + ], + "zun": + [ + "z", + "uen" + ], + "cun": + [ + "c", + "uen" + ], + "sun": + [ + "s", + "uen" + ], + "zhun": + [ + "zh", + "uen" + ], + "chun": + [ + "ch", + "uen" + ], + "shun": + [ + "sh", + "uen" + ], + "gun": + [ + "g", + "uen" + ], + "kun": + [ + "k", + "uen" + ], + "hun": + [ + "h", + "uen" + ], + "run": + [ + "r", + "uen" + ], + "wen": + [ + "w", + "uen" + ], + "uen": + [ + "uen" + ], + "bin": + [ + "b", + "in" + ], + "pin": + [ + "p", + "in" + ], + "min": + [ + "m", + "in" + ], 
+ "fin": + [ + "f", + "in" + ], + "din": + [ + "d", + "in" + ], + "tin": + [ + "t", + "in" + ], + "nin": + [ + "n", + "in" + ], + "lin": + [ + "l", + "in" + ], + "zin": + [ + "z", + "in" + ], + "cin": + [ + "c", + "in" + ], + "sin": + [ + "s", + "in" + ], + "jin": + [ + "j", + "in" + ], + "qin": + [ + "q", + "in" + ], + "xin": + [ + "x", + "in" + ], + "yin": + [ + "y", + "in" + ], + "in": + [ + "in" + ], + "jun": + [ + "j", + "vn" + ], + "qun": + [ + "q", + "vn" + ], + "xun": + [ + "x", + "vn" + ], + "yun": + [ + "y", + "vn" + ], + "vn": + [ + "vn" + ], + "bang": + [ + "b", + "ang" + ], + "pang": + [ + "p", + "ang" + ], + "mang": + [ + "m", + "ang" + ], + "fang": + [ + "f", + "ang" + ], + "dang": + [ + "d", + "ang" + ], + "tang": + [ + "t", + "ang" + ], + "nang": + [ + "n", + "ang" + ], + "lang": + [ + "l", + "ang" + ], + "zang": + [ + "z", + "ang" + ], + "cang": + [ + "c", + "ang" + ], + "sang": + [ + "s", + "ang" + ], + "zhang": + [ + "zh", + "ang" + ], + "chang": + [ + "ch", + "ang" + ], + "shang": + [ + "sh", + "ang" + ], + "gang": + [ + "g", + "ang" + ], + "kang": + [ + "k", + "ang" + ], + "hang": + [ + "h", + "ang" + ], + "rang": + [ + "r", + "ang" + ], + "ang": + [ + "ang" + ], + "biang": + [ + "b", + "iang" + ], + "piang": + [ + "p", + "iang" + ], + "miang": + [ + "m", + "iang" + ], + "fiang": + [ + "f", + "iang" + ], + "diang": + [ + "d", + "iang" + ], + "tiang": + [ + "t", + "iang" + ], + "niang": + [ + "n", + "iang" + ], + "liang": + [ + "l", + "iang" + ], + "ziang": + [ + "z", + "iang" + ], + "ciang": + [ + "c", + "iang" + ], + "siang": + [ + "s", + "iang" + ], + "jiang": + [ + "j", + "iang" + ], + "qiang": + [ + "q", + "iang" + ], + "xiang": + [ + "x", + "iang" + ], + "yang": + [ + "y", + "iang" + ], + "iang": + [ + "iang" + ], + "luang": + [ + "l", + "uang" + ], + "zuang": + [ + "z", + "uang" + ], + "cuang": + [ + "c", + "uang" + ], + "suang": + [ + "s", + "uang" + ], + "zhuang": + [ + "zh", + "uang" + ], + "chuang": + [ + "ch", + "uang" + ], + "shuang": + [ + "sh", + "uang" + ], + "guang": + [ + "g", + "uang" + ], + "kuang": + [ + "k", + "uang" + ], + "huang": + [ + "h", + "uang" + ], + "ruang": + [ + "r", + "uang" + ], + "wang": + [ + "w", + "uang" + ], + "uang": + [ + "uang" + ], + "beng": + [ + "b", + "eng" + ], + "peng": + [ + "p", + "eng" + ], + "meng": + [ + "m", + "eng" + ], + "feng": + [ + "f", + "eng" + ], + "deng": + [ + "d", + "eng" + ], + "teng": + [ + "t", + "eng" + ], + "neng": + [ + "n", + "eng" + ], + "leng": + [ + "l", + "eng" + ], + "zeng": + [ + "z", + "eng" + ], + "ceng": + [ + "c", + "eng" + ], + "seng": + [ + "s", + "eng" + ], + "zheng": + [ + "zh", + "eng" + ], + "cheng": + [ + "ch", + "eng" + ], + "sheng": + [ + "sh", + "eng" + ], + "geng": + [ + "g", + "eng" + ], + "keng": + [ + "k", + "eng" + ], + "heng": + [ + "h", + "eng" + ], + "reng": + [ + "r", + "eng" + ], + "eng": + [ + "eng" + ], + "weng": + [ + "w", + "ueng" + ], + "ueng": + [ + "ueng" + ], + "bing": + [ + "b", + "ing" + ], + "ping": + [ + "p", + "ing" + ], + "ming": + [ + "m", + "ing" + ], + "fing": + [ + "f", + "ing" + ], + "ding": + [ + "d", + "ing" + ], + "ting": + [ + "t", + "ing" + ], + "ning": + [ + "n", + "ing" + ], + "ling": + [ + "l", + "ing" + ], + "zing": + [ + "z", + "ing" + ], + "cing": + [ + "c", + "ing" + ], + "sing": + [ + "s", + "ing" + ], + "jing": + [ + "j", + "ing" + ], + "qing": + [ + "q", + "ing" + ], + "xing": + [ + "x", + "ing" + ], + "ging": + [ + "g", + "ing" + ], + "king": + [ + "k", + "ing" + ], + "hing": + [ + "h", + "ing" + ], + "ying": + [ + "y", + "ing" + ], 
+ "ing": + [ + "ing" + ], + "bong": + [ + "b", + "ong" + ], + "pong": + [ + "p", + "ong" + ], + "mong": + [ + "m", + "ong" + ], + "fong": + [ + "f", + "ong" + ], + "dong": + [ + "d", + "ong" + ], + "tong": + [ + "t", + "ong" + ], + "nong": + [ + "n", + "ong" + ], + "long": + [ + "l", + "ong" + ], + "zong": + [ + "z", + "ong" + ], + "cong": + [ + "c", + "ong" + ], + "song": + [ + "s", + "ong" + ], + "zhong": + [ + "zh", + "ong" + ], + "chong": + [ + "ch", + "ong" + ], + "shong": + [ + "sh", + "ong" + ], + "gong": + [ + "g", + "ong" + ], + "kong": + [ + "k", + "ong" + ], + "hong": + [ + "h", + "ong" + ], + "rong": + [ + "r", + "ong" + ], + "ong": + [ + "ong" + ], + "jiong": + [ + "j", + "iong" + ], + "qiong": + [ + "q", + "iong" + ], + "xiong": + [ + "x", + "iong" + ], + "yong": + [ + "y", + "iong" + ], + "iong": + [ + "iong" + ], + "bar": + [ + "b", + "ar" + ], + "par": + [ + "p", + "ar" + ], + "mar": + [ + "m", + "ar" + ], + "far": + [ + "f", + "ar" + ], + "dar": + [ + "d", + "ar" + ], + "tar": + [ + "t", + "ar" + ], + "nar": + [ + "n", + "ar" + ], + "lar": + [ + "l", + "ar" + ], + "zar": + [ + "z", + "ar" + ], + "car": + [ + "c", + "ar" + ], + "sar": + [ + "s", + "ar" + ], + "zhar": + [ + "zh", + "ar" + ], + "char": + [ + "ch", + "ar" + ], + "shar": + [ + "sh", + "ar" + ], + "gar": + [ + "g", + "ar" + ], + "kar": + [ + "k", + "ar" + ], + "har": + [ + "h", + "ar" + ], + "rar": + [ + "r", + "ar" + ], + "ar": + [ + "ar" + ], + "biar": + [ + "b", + "iar" + ], + "piar": + [ + "p", + "iar" + ], + "miar": + [ + "m", + "iar" + ], + "fiar": + [ + "f", + "iar" + ], + "diar": + [ + "d", + "iar" + ], + "tiar": + [ + "t", + "iar" + ], + "niar": + [ + "n", + "iar" + ], + "liar": + [ + "l", + "iar" + ], + "jiar": + [ + "j", + "iar" + ], + "qiar": + [ + "q", + "iar" + ], + "xiar": + [ + "x", + "iar" + ], + "yar": + [ + "y", + "iar" + ], + "iar": + [ + "iar" + ], + "juar": + [ + "j", + "var" + ], + "quar": + [ + "q", + "var" + ], + "xuar": + [ + "x", + "var" + ], + "yuar": + [ + "y", + "var" + ], + "var": + [ + "var" + ], + "duar": + [ + "d", + "uar" + ], + "tuar": + [ + "t", + "uar" + ], + "nuar": + [ + "n", + "uar" + ], + "luar": + [ + "l", + "uar" + ], + "zuar": + [ + "z", + "uar" + ], + "cuar": + [ + "c", + "uar" + ], + "suar": + [ + "s", + "uar" + ], + "zhuar": + [ + "zh", + "uar" + ], + "chuar": + [ + "ch", + "uar" + ], + "shuar": + [ + "sh", + "uar" + ], + "guar": + [ + "g", + "uar" + ], + "kuar": + [ + "k", + "uar" + ], + "huar": + [ + "h", + "uar" + ], + "ruar": + [ + "r", + "uar" + ], + "war": + [ + "w", + "uar" + ], + "uar": + [ + "uar" + ], + "baor": + [ + "b", + "aor" + ], + "paor": + [ + "p", + "aor" + ], + "maor": + [ + "m", + "aor" + ], + "faor": + [ + "f", + "aor" + ], + "daor": + [ + "d", + "aor" + ], + "taor": + [ + "t", + "aor" + ], + "naor": + [ + "n", + "aor" + ], + "laor": + [ + "l", + "aor" + ], + "zaor": + [ + "z", + "aor" + ], + "caor": + [ + "c", + "aor" + ], + "saor": + [ + "s", + "aor" + ], + "zhaor": + [ + "zh", + "aor" + ], + "chaor": + [ + "ch", + "aor" + ], + "shaor": + [ + "sh", + "aor" + ], + "gaor": + [ + "g", + "aor" + ], + "kaor": + [ + "k", + "aor" + ], + "haor": + [ + "h", + "aor" + ], + "raor": + [ + "r", + "aor" + ], + "aor": + [ + "aor" + ], + "biaor": + [ + "b", + "iaor" + ], + "piaor": + [ + "p", + "iaor" + ], + "miaor": + [ + "m", + "iaor" + ], + "fiaor": + [ + "f", + "iaor" + ], + "diaor": + [ + "d", + "iaor" + ], + "tiaor": + [ + "t", + "iaor" + ], + "niaor": + [ + "n", + "iaor" + ], + "liaor": + [ + "l", + "iaor" + ], + "jiaor": + [ + "j", + "iaor" + ], + 
"qiaor": + [ + "q", + "iaor" + ], + "xiaor": + [ + "x", + "iaor" + ], + "yaor": + [ + "y", + "iaor" + ], + "iaor": + [ + "iaor" + ], + "beer": + [ + "b", + "eer" + ], + "peer": + [ + "p", + "eer" + ], + "meer": + [ + "m", + "eer" + ], + "feer": + [ + "f", + "eer" + ], + "deer": + [ + "d", + "eer" + ], + "teer": + [ + "t", + "eer" + ], + "neer": + [ + "n", + "eer" + ], + "leer": + [ + "l", + "eer" + ], + "zeer": + [ + "z", + "eer" + ], + "ceer": + [ + "c", + "eer" + ], + "seer": + [ + "s", + "eer" + ], + "zheer": + [ + "zh", + "eer" + ], + "cheer": + [ + "ch", + "eer" + ], + "sheer": + [ + "sh", + "eer" + ], + "geer": + [ + "g", + "eer" + ], + "keer": + [ + "k", + "eer" + ], + "heer": + [ + "h", + "eer" + ], + "reer": + [ + "r", + "eer" + ], + "eer": + [ + "eer" + ], + "bier": + [ + "b", + "ier" + ], + "pier": + [ + "p", + "ier" + ], + "mier": + [ + "m", + "ier" + ], + "fier": + [ + "f", + "ier" + ], + "dier": + [ + "d", + "ier" + ], + "tier": + [ + "t", + "ier" + ], + "nier": + [ + "n", + "ier" + ], + "lier": + [ + "l", + "ier" + ], + "jier": + [ + "j", + "ier" + ], + "qier": + [ + "q", + "ier" + ], + "xier": + [ + "x", + "ier" + ], + "yer": + [ + "y", + "ier" + ], + "ier": + [ + "ier" + ], + "nver": + [ + "n", + "ver" + ], + "lver": + [ + "l", + "ver" + ], + "juer": + [ + "j", + "ver" + ], + "quer": + [ + "q", + "ver" + ], + "xuer": + [ + "x", + "ver" + ], + "yuer": + [ + "y", + "ver" + ], + "ver": + [ + "ver" + ], + "bour": + [ + "b", + "our" + ], + "pour": + [ + "p", + "our" + ], + "mour": + [ + "m", + "our" + ], + "four": + [ + "f", + "our" + ], + "dour": + [ + "d", + "our" + ], + "tour": + [ + "t", + "our" + ], + "nour": + [ + "n", + "our" + ], + "lour": + [ + "l", + "our" + ], + "zour": + [ + "z", + "our" + ], + "cour": + [ + "c", + "our" + ], + "sour": + [ + "s", + "our" + ], + "zhour": + [ + "zh", + "our" + ], + "chour": + [ + "ch", + "our" + ], + "shour": + [ + "sh", + "our" + ], + "gour": + [ + "g", + "our" + ], + "kour": + [ + "k", + "our" + ], + "hour": + [ + "h", + "our" + ], + "rour": + [ + "r", + "our" + ], + "our": + [ + "our" + ], + "biur": + [ + "b", + "iour" + ], + "piur": + [ + "p", + "iour" + ], + "miur": + [ + "m", + "iour" + ], + "fiur": + [ + "f", + "iour" + ], + "diur": + [ + "d", + "iour" + ], + "tiur": + [ + "t", + "iour" + ], + "niur": + [ + "n", + "iour" + ], + "liur": + [ + "l", + "iour" + ], + "jiur": + [ + "j", + "iour" + ], + "qiur": + [ + "q", + "iour" + ], + "xiur": + [ + "x", + "iour" + ], + "your": + [ + "y", + "iour" + ], + "iour": + [ + "iour" + ], + "bor": + [ + "b", + "uor" + ], + "por": + [ + "p", + "uor" + ], + "mor": + [ + "m", + "uor" + ], + "for": + [ + "f", + "uor" + ], + "duor": + [ + "d", + "uor" + ], + "tuor": + [ + "t", + "uor" + ], + "nuor": + [ + "n", + "uor" + ], + "luor": + [ + "l", + "uor" + ], + "zuor": + [ + "z", + "uor" + ], + "cuor": + [ + "c", + "uor" + ], + "suor": + [ + "s", + "uor" + ], + "zhuor": + [ + "zh", + "uor" + ], + "chuor": + [ + "ch", + "uor" + ], + "shuor": + [ + "sh", + "uor" + ], + "guor": + [ + "g", + "uor" + ], + "kuor": + [ + "k", + "uor" + ], + "huor": + [ + "h", + "uor" + ], + "ruor": + [ + "r", + "uor" + ], + "wor": + [ + "w", + "uor" + ], + "uor": + [ + "uor" + ], + "ber": + [ + "b", + "er" + ], + "per": + [ + "p", + "er" + ], + "mer": + [ + "m", + "er" + ], + "fer": + [ + "f", + "er" + ], + "der": + [ + "d", + "er" + ], + "ter": + [ + "t", + "er" + ], + "ner": + [ + "n", + "er" + ], + "ler": + [ + "l", + "er" + ], + "zer": + [ + "z", + "er" + ], + "cer": + [ + "c", + "er" + ], + "ser": + [ + "s", + "er" + 
], + "zher": + [ + "zh", + "er" + ], + "cher": + [ + "ch", + "er" + ], + "sher": + [ + "sh", + "er" + ], + "ger": + [ + "g", + "er" + ], + "ker": + [ + "k", + "er" + ], + "her": + [ + "h", + "er" + ], + "rer": + [ + "r", + "er" + ], + "er": + [ + "er" + ], + "bir": + [ + "b", + "ir" + ], + "pir": + [ + "p", + "ir" + ], + "mir": + [ + "m", + "ir" + ], + "fir": + [ + "f", + "ir" + ], + "dir": + [ + "d", + "ir" + ], + "tir": + [ + "t", + "ir" + ], + "nir": + [ + "n", + "ir" + ], + "lir": + [ + "l", + "ir" + ], + "jir": + [ + "j", + "ir" + ], + "qir": + [ + "q", + "ir" + ], + "xir": + [ + "x", + "ir" + ], + "yir": + [ + "y", + "ir" + ], + "ir": + [ + "ir" + ], + "nvr": + [ + "n", + "vr" + ], + "lvr": + [ + "l", + "vr" + ], + "jur": + [ + "j", + "vr" + ], + "qur": + [ + "q", + "vr" + ], + "xur": + [ + "x", + "vr" + ], + "yur": + [ + "y", + "vr" + ], + "vr": + [ + "vr" + ], + "duer": + [ + "d", + "uer" + ], + "tuer": + [ + "t", + "uer" + ], + "nuer": + [ + "n", + "uer" + ], + "luer": + [ + "l", + "uer" + ], + "zuer": + [ + "z", + "uer" + ], + "cuer": + [ + "c", + "uer" + ], + "suer": + [ + "s", + "uer" + ], + "zhuer": + [ + "zh", + "uer" + ], + "chuer": + [ + "ch", + "uer" + ], + "shuer": + [ + "sh", + "uer" + ], + "guer": + [ + "g", + "uer" + ], + "kuer": + [ + "k", + "uer" + ], + "huer": + [ + "h", + "uer" + ], + "ruer": + [ + "r", + "uer" + ], + "wer": + [ + "w", + "uer" + ], + "uer": + [ + "uer" + ], + "bur": + [ + "b", + "ur" + ], + "pur": + [ + "p", + "ur" + ], + "mur": + [ + "m", + "ur" + ], + "fur": + [ + "f", + "ur" + ], + "dur": + [ + "d", + "ur" + ], + "tur": + [ + "t", + "ur" + ], + "nur": + [ + "n", + "ur" + ], + "lur": + [ + "l", + "ur" + ], + "zur": + [ + "z", + "ur" + ], + "cur": + [ + "c", + "ur" + ], + "sur": + [ + "s", + "ur" + ], + "zhur": + [ + "zh", + "ur" + ], + "chur": + [ + "ch", + "ur" + ], + "shur": + [ + "sh", + "ur" + ], + "gur": + [ + "g", + "ur" + ], + "kur": + [ + "k", + "ur" + ], + "hur": + [ + "h", + "ur" + ], + "rur": + [ + "r", + "ur" + ], + "wur": + [ + "w", + "ur" + ], + "ur": + [ + "ur" + ], + "bangr": + [ + "b", + "angr" + ], + "pangr": + [ + "p", + "angr" + ], + "mangr": + [ + "m", + "angr" + ], + "fangr": + [ + "f", + "angr" + ], + "dangr": + [ + "d", + "angr" + ], + "tangr": + [ + "t", + "angr" + ], + "nangr": + [ + "n", + "angr" + ], + "langr": + [ + "l", + "angr" + ], + "zangr": + [ + "z", + "angr" + ], + "cangr": + [ + "c", + "angr" + ], + "sangr": + [ + "s", + "angr" + ], + "zhangr": + [ + "zh", + "angr" + ], + "changr": + [ + "ch", + "angr" + ], + "shangr": + [ + "sh", + "angr" + ], + "gangr": + [ + "g", + "angr" + ], + "kangr": + [ + "k", + "angr" + ], + "hangr": + [ + "h", + "angr" + ], + "rangr": + [ + "r", + "angr" + ], + "angr": + [ + "angr" + ], + "biangr": + [ + "b", + "iangr" + ], + "niangr": + [ + "n", + "iangr" + ], + "liangr": + [ + "l", + "iangr" + ], + "jiangr": + [ + "j", + "iangr" + ], + "qiangr": + [ + "q", + "iangr" + ], + "xiangr": + [ + "x", + "iangr" + ], + "yangr": + [ + "y", + "iangr" + ], + "iangr": + [ + "iangr" + ], + "zhuangr": + [ + "zh", + "uangr" + ], + "chuangr": + [ + "ch", + "uangr" + ], + "shuangr": + [ + "sh", + "uangr" + ], + "guangr": + [ + "g", + "uangr" + ], + "kuangr": + [ + "k", + "uangr" + ], + "huangr": + [ + "h", + "uangr" + ], + "ruangr": + [ + "r", + "uangr" + ], + "wangr": + [ + "w", + "uangr" + ], + "uangr": + [ + "uangr" + ], + "bengr": + [ + "b", + "engr" + ], + "pengr": + [ + "p", + "engr" + ], + "mengr": + [ + "m", + "engr" + ], + "fengr": + [ + "f", + "engr" + ], + "dengr": + [ + "d", + "engr" + 
], + "tengr": + [ + "t", + "engr" + ], + "nengr": + [ + "n", + "engr" + ], + "lengr": + [ + "l", + "engr" + ], + "zengr": + [ + "z", + "engr" + ], + "cengr": + [ + "c", + "engr" + ], + "sengr": + [ + "s", + "engr" + ], + "zhengr": + [ + "zh", + "engr" + ], + "chengr": + [ + "ch", + "engr" + ], + "shengr": + [ + "sh", + "engr" + ], + "gengr": + [ + "g", + "engr" + ], + "kengr": + [ + "k", + "engr" + ], + "hengr": + [ + "h", + "engr" + ], + "rengr": + [ + "r", + "engr" + ], + "engr": + [ + "engr" + ], + "bingr": + [ + "b", + "ingr" + ], + "pingr": + [ + "p", + "ingr" + ], + "mingr": + [ + "m", + "ingr" + ], + "fingr": + [ + "f", + "ingr" + ], + "dingr": + [ + "d", + "ingr" + ], + "tingr": + [ + "t", + "ingr" + ], + "ningr": + [ + "n", + "ingr" + ], + "lingr": + [ + "l", + "ingr" + ], + "jingr": + [ + "j", + "ingr" + ], + "qingr": + [ + "q", + "ingr" + ], + "xingr": + [ + "x", + "ingr" + ], + "yingr": + [ + "y", + "ingr" + ], + "ingr": + [ + "ingr" + ], + "wengr": + [ + "w", + "uengr" + ], + "uengr": + [ + "uengr" + ], + "dongr": + [ + "d", + "ongr" + ], + "tongr": + [ + "t", + "ongr" + ], + "nongr": + [ + "n", + "ongr" + ], + "longr": + [ + "l", + "ongr" + ], + "zongr": + [ + "z", + "ongr" + ], + "congr": + [ + "c", + "ongr" + ], + "songr": + [ + "s", + "ongr" + ], + "zhongr": + [ + "zh", + "ongr" + ], + "chongr": + [ + "ch", + "ongr" + ], + "shongr": + [ + "sh", + "ongr" + ], + "gongr": + [ + "g", + "ongr" + ], + "kongr": + [ + "k", + "ongr" + ], + "hongr": + [ + "h", + "ongr" + ], + "rongr": + [ + "r", + "ongr" + ], + "ongr": + [ + "ongr" + ], + "jiongr": + [ + "j", + "iongr" + ], + "qiongr": + [ + "q", + "iongr" + ], + "xiongr": + [ + "x", + "iongr" + ], + "yongr": + [ + "y", + "iongr" + ], + "iongr": + [ + "iongr" + ], + "mm": + [ + "mm" + ], + "nn": + [ + "nn" + ], + "ng": + [ + "ng" + ] + }, + "phon_class": + { + "head": + [ + "b", + "z", + "l", + "sh", + "p", + "d", + "m", + "x", + "s", + "y", + "r", + "f", + "n", + "h", + "c", + "j", + "zh", + "ch", + "t", + "g", + "q", + "w", + "k" + ], + "tail": + [ + "ia", + "i", + "ua", + "ei", + "van", + "iang", + "a", + "iong", + "uar", + "angr", + "nn", + "uengr", + "ng", + "ao", + "uo", + "iou", + "en", + "uang", + "er", + "eh", + "ier", + "v", + "ae", + "aor", + "vr", + "ing", + "iao", + "in", + "our", + "uei", + "ir", + "eer", + "iii", + "ong", + "ver", + "iour", + "iai", + "u", + "io", + "ie", + "aen", + "ueng", + "ur", + "engr", + "ingr", + "ongr", + "eng", + "ou", + "uen", + "o", + "uae", + "e", + "uao", + "ar", + "ve", + "iangr", + "ii", + "var", + "iar", + "iongr", + "uer", + "iei", + "iaor", + "uou", + "uan", + "ang", + "ai", + "uor", + "uangr", + "mm", + "an", + "uai", + "ian", + "vn" + ] + } + }, + { + "language": "jp", + "syllable_alias": + { + "n": "nv", + "m": "mv", + "N": "nv", + "M": "mv", + "la": "ra", + "A": "ax", + "I": "ix", + "U": "ux", + "E": "ex", + "O": "ox", + "t": "cl", + "jya": "ja", + "jyu": "ju", + "jye": "je", + "jyo": "jo", + "sya": "sha", + "syu": "shu", + "sye": "she", + "syo": "sho", + "cya": "cha", + "cyu": "chu", + "cye": "che", + "cyo": "cho", + "kA": "kax", + "kE": "kex", + "kO": "kox", + "sA": "sax", + "sI": "six", + "sU": "sux", + "sE": "sex", + "sO": "sox", + "tA": "tax", + "tU": "tux", + "tE": "tex", + "tO": "tox", + "nA": "nax", + "nU": "nux", + "nE": "nex", + "nO": "nox", + "hA": "hax", + "hU": "hux", + "hE": "hex", + "hO": "hox", + "mA": "max", + "mU": "mux", + "mE": "mex", + "mO": "mox", + "yA": "yax", + "yU": "yux", + "yE": "yex", + "yO": "yox", + "rA": "rax", + "rU": "rux", + "rE": "rex", + "rO": 
"rox", + "rrA": "rrax", + "rrI": "rrix", + "rrU": "rrux", + "rrE": "rrex", + "rrO": "rrox", + "wA": "wax", + "wI": "wix", + "wE": "wex", + "wO": "wox", + "gA": "gax", + "gE": "gex", + "gO": "gox", + "zA": "zax", + "zI": "zix", + "zU": "zux", + "zE": "zex", + "zO": "zox", + "jA": "jax", + "jyA": "jax", + "jI": "jix", + "jU": "jux", + "jyU": "jux", + "jE": "jex", + "jyE": "jex", + "jO": "jox", + "jyO": "jox", + "dA": "dax", + "dU": "dux", + "dE": "dex", + "dO": "dox", + "bA": "bax", + "bU": "bux", + "bE": "bex", + "bO": "box", + "pA": "pax", + "pU": "pux", + "pE": "pex", + "pO": "pox", + "fA": "fax", + "fU": "fux", + "fE": "fex", + "fO": "fox", + "vA": "vax", + "vU": "vux", + "vE": "vex", + "vO": "vox", + "tsA": "tsax", + "tsI": "tsix", + "tsU": "tsux", + "tsE": "tsex", + "tsO": "tsox", + "ngA": "ngax", + "ngU": "ngux", + "ngE": "ngex", + "ngO": "ngox", + "shA": "shax", + "syA": "shax", + "shI": "shix", + "shU": "shux", + "syU": "shux", + "shE": "shex", + "syE": "shex", + "shO": "shox", + "syO": "shox", + "chA": "chax", + "cyA": "chax", + "chI": "chix", + "chU": "chux", + "cyU": "chux", + "chE": "chex", + "cyE": "chex", + "chO": "chox", + "cyO": "chox", + "kyA": "kyax", + "kI": "kix", + "kyU": "kyux", + "kyE": "kyex", + "kyO": "kyox", + "tyA": "tyax", + "tI": "tix", + "tyU": "tyux", + "tyE": "tyex", + "tyO": "tyox", + "myA": "myax", + "mI": "mix", + "myU": "myux", + "myE": "myex", + "myO": "myox", + "nyA": "nyax", + "nI": "nix", + "nyU": "nyux", + "nyE": "nyex", + "nyO": "nyox", + "hyA": "hyax", + "hI": "hix", + "hyU": "hyux", + "hyE": "hyex", + "hyO": "hyox", + "ryA": "ryax", + "rI": "rix", + "ryU": "ryux", + "ryE": "ryex", + "ryO": "ryox", + "gyA": "gyax", + "gI": "gix", + "gyU": "gyux", + "gyE": "gyex", + "gyO": "gyox", + "byA": "byax", + "bI": "bix", + "byU": "byux", + "byE": "byex", + "byO": "byox", + "pyA": "pyax", + "pI": "pix", + "pyU": "pyux", + "pyE": "pyex", + "pyO": "pyox", + "ngyA": "ngyax", + "ngI": "ngix", + "ngyU": "ngyux", + "ngyE": "ngyex", + "ngyO": "ngyox", + "fyA": "fyax", + "fI": "fix", + "fyU": "fyux", + "fyE": "fyex", + "fyO": "fyox", + "vyA": "vyax", + "vI": "vix", + "vyU": "vyux", + "vyE": "vyex", + "vyO": "vyox", + "kwA": "kwax", + "kwI": "kwix", + "kU": "kux", + "kwE": "kwex", + "kwO": "kwox", + "gwA": "gwax", + "gwI": "gwix", + "gU": "gux", + "gwE": "gwex", + "gwO": "gwox" + }, + "dict": + { + "a": + [ + "a" + ], + "i": + [ + "i" + ], + "u": + [ + "u" + ], + "e": + [ + "e" + ], + "o": + [ + "o" + ], + "ax": + [ + "ax" + ], + "ix": + [ + "ix" + ], + "ux": + [ + "ux" + ], + "ex": + [ + "ex" + ], + "ox": + [ + "ox" + ], + "nv": + [ + "nv" + ], + "mv": + [ + "mv" + ], + "cl": + [ + "cl" + ], + "ka": + [ + "k", + "a" + ], + "ke": + [ + "k", + "e" + ], + "ko": + [ + "k", + "o" + ], + "kax": + [ + "k", + "ax" + ], + "kex": + [ + "k", + "ex" + ], + "kox": + [ + "k", + "ox" + ], + "sa": + [ + "s", + "a" + ], + "si": + [ + "s", + "i" + ], + "su": + [ + "s", + "u" + ], + "se": + [ + "s", + "e" + ], + "so": + [ + "s", + "o" + ], + "sax": + [ + "s", + "ax" + ], + "six": + [ + "s", + "ix" + ], + "sux": + [ + "s", + "ux" + ], + "sex": + [ + "s", + "ex" + ], + "sox": + [ + "s", + "ox" + ], + "ta": + [ + "t", + "a" + ], + "tu": + [ + "t", + "u" + ], + "te": + [ + "t", + "e" + ], + "to": + [ + "t", + "o" + ], + "tax": + [ + "t", + "ax" + ], + "tux": + [ + "t", + "ux" + ], + "tex": + [ + "t", + "ex" + ], + "tox": + [ + "t", + "ox" + ], + "na": + [ + "n", + "a" + ], + "nu": + [ + "n", + "u" + ], + "ne": + [ + "n", + "e" + ], + "no": + [ + "n", + "o" + ], + "nax": + [ + "n", + "ax" + 
], + "nux": + [ + "n", + "ux" + ], + "nex": + [ + "n", + "ex" + ], + "nox": + [ + "n", + "ox" + ], + "ha": + [ + "h", + "a" + ], + "hu": + [ + "h", + "u" + ], + "he": + [ + "h", + "e" + ], + "ho": + [ + "h", + "o" + ], + "hax": + [ + "h", + "ax" + ], + "hux": + [ + "h", + "ux" + ], + "hex": + [ + "h", + "ex" + ], + "hox": + [ + "h", + "ox" + ], + "ma": + [ + "m", + "a" + ], + "mu": + [ + "m", + "u" + ], + "me": + [ + "m", + "e" + ], + "mo": + [ + "m", + "o" + ], + "max": + [ + "m", + "ax" + ], + "mux": + [ + "m", + "ux" + ], + "mex": + [ + "m", + "ex" + ], + "mox": + [ + "m", + "ox" + ], + "ya": + [ + "y", + "a" + ], + "yu": + [ + "y", + "u" + ], + "ye": + [ + "y", + "e" + ], + "yo": + [ + "y", + "o" + ], + "yax": + [ + "y", + "ax" + ], + "yux": + [ + "y", + "ux" + ], + "yex": + [ + "y", + "ex" + ], + "yox": + [ + "y", + "ox" + ], + "ra": + [ + "r", + "a" + ], + "ru": + [ + "r", + "u" + ], + "re": + [ + "r", + "e" + ], + "ro": + [ + "r", + "o" + ], + "rax": + [ + "r", + "ax" + ], + "rux": + [ + "r", + "ux" + ], + "rex": + [ + "r", + "ex" + ], + "rox": + [ + "r", + "ox" + ], + "rra": + [ + "rr", + "a" + ], + "rri": + [ + "rr", + "i" + ], + "rru": + [ + "rr", + "u" + ], + "rre": + [ + "rr", + "e" + ], + "rro": + [ + "rr", + "o" + ], + "rrax": + [ + "rr", + "ax" + ], + "rrix": + [ + "rr", + "ix" + ], + "rrux": + [ + "rr", + "ux" + ], + "rrex": + [ + "rr", + "ex" + ], + "rrox": + [ + "rr", + "ox" + ], + "wa": + [ + "w", + "a" + ], + "wi": + [ + "w", + "i" + ], + "we": + [ + "w", + "e" + ], + "wo": + [ + "w", + "o" + ], + "wax": + [ + "w", + "ax" + ], + "wix": + [ + "w", + "ix" + ], + "wex": + [ + "w", + "ex" + ], + "wox": + [ + "w", + "ox" + ], + "ga": + [ + "g", + "a" + ], + "ge": + [ + "g", + "e" + ], + "go": + [ + "g", + "o" + ], + "gax": + [ + "g", + "ax" + ], + "gex": + [ + "g", + "ex" + ], + "gox": + [ + "g", + "ox" + ], + "za": + [ + "z", + "a" + ], + "zi": + [ + "z", + "i" + ], + "zu": + [ + "z", + "u" + ], + "ze": + [ + "z", + "e" + ], + "zo": + [ + "z", + "o" + ], + "zax": + [ + "z", + "ax" + ], + "zix": + [ + "z", + "ix" + ], + "zux": + [ + "z", + "ux" + ], + "zex": + [ + "z", + "ex" + ], + "zox": + [ + "z", + "ox" + ], + "ja": + [ + "j", + "a" + ], + "ji": + [ + "j", + "i" + ], + "ju": + [ + "j", + "u" + ], + "je": + [ + "j", + "e" + ], + "jo": + [ + "j", + "o" + ], + "jax": + [ + "j", + "ax" + ], + "jix": + [ + "j", + "ix" + ], + "jux": + [ + "j", + "ux" + ], + "jex": + [ + "j", + "ex" + ], + "jox": + [ + "j", + "ox" + ], + "da": + [ + "d", + "a" + ], + "du": + [ + "d", + "u" + ], + "de": + [ + "d", + "e" + ], + "do": + [ + "d", + "o" + ], + "dax": + [ + "d", + "ax" + ], + "dux": + [ + "d", + "ux" + ], + "dex": + [ + "d", + "ex" + ], + "dox": + [ + "d", + "ox" + ], + "ba": + [ + "b", + "a" + ], + "bu": + [ + "b", + "u" + ], + "be": + [ + "b", + "e" + ], + "bo": + [ + "b", + "o" + ], + "bax": + [ + "b", + "ax" + ], + "bux": + [ + "b", + "ux" + ], + "bex": + [ + "b", + "ex" + ], + "box": + [ + "b", + "ox" + ], + "pa": + [ + "p", + "a" + ], + "pu": + [ + "p", + "u" + ], + "pe": + [ + "p", + "e" + ], + "po": + [ + "p", + "o" + ], + "pax": + [ + "p", + "ax" + ], + "pux": + [ + "p", + "ux" + ], + "pex": + [ + "p", + "ex" + ], + "pox": + [ + "p", + "ox" + ], + "fa": + [ + "f", + "a" + ], + "fu": + [ + "f", + "u" + ], + "fe": + [ + "f", + "e" + ], + "fo": + [ + "f", + "o" + ], + "fax": + [ + "f", + "ax" + ], + "fux": + [ + "f", + "ux" + ], + "fex": + [ + "f", + "ex" + ], + "fox": + [ + "f", + "ox" + ], + "va": + [ + "v", + "a" + ], + "vu": + [ + "v", + "u" + ], + "ve": + [ + "v", + "e" + 
], + "vo": + [ + "v", + "o" + ], + "vax": + [ + "v", + "ax" + ], + "vux": + [ + "v", + "ux" + ], + "vex": + [ + "v", + "ex" + ], + "vox": + [ + "v", + "ox" + ], + "tsa": + [ + "ts", + "a" + ], + "tsi": + [ + "ts", + "i" + ], + "tsu": + [ + "ts", + "u" + ], + "tse": + [ + "ts", + "e" + ], + "tso": + [ + "ts", + "o" + ], + "tsax": + [ + "ts", + "ax" + ], + "tsix": + [ + "ts", + "ix" + ], + "tsux": + [ + "ts", + "ux" + ], + "tsex": + [ + "ts", + "ex" + ], + "tsox": + [ + "ts", + "ox" + ], + "nga": + [ + "ng", + "a" + ], + "ngu": + [ + "ng", + "u" + ], + "nge": + [ + "ng", + "e" + ], + "ngo": + [ + "ng", + "o" + ], + "ngax": + [ + "ng", + "ax" + ], + "ngux": + [ + "ng", + "ux" + ], + "ngex": + [ + "ng", + "ex" + ], + "ngox": + [ + "ng", + "ox" + ], + "sha": + [ + "sh", + "a" + ], + "shi": + [ + "sh", + "i" + ], + "shu": + [ + "sh", + "u" + ], + "she": + [ + "sh", + "e" + ], + "sho": + [ + "sh", + "o" + ], + "shax": + [ + "sh", + "ax" + ], + "shix": + [ + "sh", + "ix" + ], + "shux": + [ + "sh", + "ux" + ], + "shex": + [ + "sh", + "ex" + ], + "shox": + [ + "sh", + "ox" + ], + "cha": + [ + "ch", + "a" + ], + "chi": + [ + "ch", + "i" + ], + "chu": + [ + "ch", + "u" + ], + "che": + [ + "ch", + "e" + ], + "cho": + [ + "ch", + "o" + ], + "chax": + [ + "ch", + "ax" + ], + "chix": + [ + "ch", + "ix" + ], + "chux": + [ + "ch", + "ux" + ], + "chex": + [ + "ch", + "ex" + ], + "chox": + [ + "ch", + "ox" + ], + "kya": + [ + "ky", + "a" + ], + "ki": + [ + "ky", + "i" + ], + "kyu": + [ + "ky", + "u" + ], + "kye": + [ + "ky", + "e" + ], + "kyo": + [ + "ky", + "o" + ], + "kyax": + [ + "ky", + "ax" + ], + "kix": + [ + "ky", + "ix" + ], + "kyux": + [ + "ky", + "ux" + ], + "kyex": + [ + "ky", + "ex" + ], + "kyox": + [ + "ky", + "ox" + ], + "tya": + [ + "ty", + "a" + ], + "ti": + [ + "ty", + "i" + ], + "tyu": + [ + "ty", + "u" + ], + "tye": + [ + "ty", + "e" + ], + "tyo": + [ + "ty", + "o" + ], + "tyax": + [ + "ty", + "ax" + ], + "tix": + [ + "ty", + "ix" + ], + "tyux": + [ + "ty", + "ux" + ], + "tyex": + [ + "ty", + "ex" + ], + "tyox": + [ + "ty", + "ox" + ], + "mya": + [ + "my", + "a" + ], + "mi": + [ + "my", + "i" + ], + "myu": + [ + "my", + "u" + ], + "mye": + [ + "my", + "e" + ], + "myo": + [ + "my", + "o" + ], + "myax": + [ + "my", + "ax" + ], + "mix": + [ + "my", + "ix" + ], + "myux": + [ + "my", + "ux" + ], + "myex": + [ + "my", + "ex" + ], + "myox": + [ + "my", + "ox" + ], + "nya": + [ + "ny", + "a" + ], + "ni": + [ + "ny", + "i" + ], + "nyu": + [ + "ny", + "u" + ], + "nye": + [ + "ny", + "e" + ], + "nyo": + [ + "ny", + "o" + ], + "nyax": + [ + "ny", + "ax" + ], + "nix": + [ + "ny", + "ix" + ], + "nyux": + [ + "ny", + "ux" + ], + "nyex": + [ + "ny", + "ex" + ], + "nyox": + [ + "ny", + "ox" + ], + "hya": + [ + "hy", + "a" + ], + "hi": + [ + "hy", + "i" + ], + "hyu": + [ + "hy", + "u" + ], + "hye": + [ + "hy", + "e" + ], + "hyo": + [ + "hy", + "o" + ], + "hyax": + [ + "hy", + "ax" + ], + "hix": + [ + "hy", + "ix" + ], + "hyux": + [ + "hy", + "ux" + ], + "hyex": + [ + "hy", + "ex" + ], + "hyox": + [ + "hy", + "ox" + ], + "rya": + [ + "ry", + "a" + ], + "ri": + [ + "ry", + "i" + ], + "ryu": + [ + "ry", + "u" + ], + "rye": + [ + "ry", + "e" + ], + "ryo": + [ + "ry", + "o" + ], + "ryax": + [ + "ry", + "ax" + ], + "rix": + [ + "ry", + "ix" + ], + "ryux": + [ + "ry", + "ux" + ], + "ryex": + [ + "ry", + "ex" + ], + "ryox": + [ + "ry", + "ox" + ], + "gya": + [ + "gy", + "a" + ], + "gi": + [ + "gy", + "i" + ], + "gyu": + [ + "gy", + "u" + ], + "gye": + [ + "gy", + "e" + ], + "gyo": + [ + "gy", + "o" + ], + "gyax": + 
[ + "gy", + "ax" + ], + "gix": + [ + "gy", + "ix" + ], + "gyux": + [ + "gy", + "ux" + ], + "gyex": + [ + "gy", + "ex" + ], + "gyox": + [ + "gy", + "ox" + ], + "bya": + [ + "by", + "a" + ], + "bi": + [ + "by", + "i" + ], + "byu": + [ + "by", + "u" + ], + "bye": + [ + "by", + "e" + ], + "byo": + [ + "by", + "o" + ], + "byax": + [ + "by", + "ax" + ], + "bix": + [ + "by", + "ix" + ], + "byux": + [ + "by", + "ux" + ], + "byex": + [ + "by", + "ex" + ], + "byox": + [ + "by", + "ox" + ], + "pya": + [ + "py", + "a" + ], + "pi": + [ + "py", + "i" + ], + "pyu": + [ + "py", + "u" + ], + "pye": + [ + "py", + "e" + ], + "pyo": + [ + "py", + "o" + ], + "pyax": + [ + "py", + "ax" + ], + "pix": + [ + "py", + "ix" + ], + "pyux": + [ + "py", + "ux" + ], + "pyex": + [ + "py", + "ex" + ], + "pyox": + [ + "py", + "ox" + ], + "dya": + [ + "dy", + "a" + ], + "di": + [ + "dy", + "i" + ], + "dyu": + [ + "dy", + "u" + ], + "dye": + [ + "dy", + "e" + ], + "dyo": + [ + "dy", + "o" + ], + "dyax": + [ + "dy", + "ax" + ], + "dix": + [ + "dy", + "ix" + ], + "dyux": + [ + "dy", + "ux" + ], + "dyex": + [ + "dy", + "ex" + ], + "dyox": + [ + "dy", + "ox" + ], + "ngya": + [ + "ngy", + "a" + ], + "ngi": + [ + "ngy", + "i" + ], + "ngyu": + [ + "ngy", + "u" + ], + "ngye": + [ + "ngy", + "e" + ], + "ngyo": + [ + "ngy", + "o" + ], + "ngyax": + [ + "ngy", + "ax" + ], + "ngix": + [ + "ngy", + "ix" + ], + "ngyux": + [ + "ngy", + "ux" + ], + "ngyex": + [ + "ngy", + "ex" + ], + "ngyox": + [ + "ngy", + "ox" + ], + "fya": + [ + "fy", + "a" + ], + "fi": + [ + "fy", + "i" + ], + "fyu": + [ + "fy", + "u" + ], + "fye": + [ + "fy", + "e" + ], + "fyo": + [ + "fy", + "o" + ], + "fyax": + [ + "fy", + "ax" + ], + "fix": + [ + "fy", + "ix" + ], + "fyux": + [ + "fy", + "ux" + ], + "fyex": + [ + "fy", + "ex" + ], + "fyox": + [ + "fy", + "ox" + ], + "vya": + [ + "vy", + "a" + ], + "vi": + [ + "vy", + "i" + ], + "vyu": + [ + "vy", + "u" + ], + "vye": + [ + "vy", + "e" + ], + "vyo": + [ + "vy", + "o" + ], + "vyax": + [ + "vy", + "ax" + ], + "vix": + [ + "vy", + "ix" + ], + "vyux": + [ + "vy", + "ux" + ], + "vyex": + [ + "vy", + "ex" + ], + "vyox": + [ + "vy", + "ox" + ], + "kwa": + [ + "kw", + "a" + ], + "kwi": + [ + "kw", + "i" + ], + "ku": + [ + "kw", + "u" + ], + "kwe": + [ + "kw", + "e" + ], + "kwo": + [ + "kw", + "o" + ], + "kwax": + [ + "kw", + "ax" + ], + "kwix": + [ + "kw", + "ix" + ], + "kux": + [ + "kw", + "ux" + ], + "kwex": + [ + "kw", + "ex" + ], + "kwox": + [ + "kw", + "ox" + ], + "gwa": + [ + "gw", + "a" + ], + "gwi": + [ + "gw", + "i" + ], + "gu": + [ + "gw", + "u" + ], + "gwe": + [ + "gw", + "e" + ], + "gwo": + [ + "gw", + "o" + ], + "gwax": + [ + "gw", + "ax" + ], + "gwix": + [ + "gw", + "ix" + ], + "gux": + [ + "gw", + "ux" + ], + "gwex": + [ + "gw", + "ex" + ], + "gwox": + [ + "gw", + "ox" + ] + }, + "phon_class": + { + "head": + [ + "k", + "s", + "sh", + "t", + "ch", + "ts", + "n", + "h", + "f", + "m", + "y", + "r", + "rr", + "w", + "g", + "z", + "j", + "d", + "b", + "p", + "ky", + "ty", + "ny", + "hy", + "my", + "ry", + "by", + "gy", + "py", + "dy", + "vy", + "fy", + "kw", + "gw", + "ng", + "ngy", + "v" + ], + "tail": + [ + "a", + "i", + "u", + "e", + "o", + "nv", + "mv", + "ax", + "ix", + "ux", + "ex", + "ox", + "cl" + ] + } + }, + { + "language": "eng", + "dict": + {}, + "phon_class": + { + "tail": + [ + "aa", + "ae", + "ah", + "ao", + "aw", + "ay", + "eh", + "er", + "ey", + "ih", + "iy", + "ow", + "oy", + "uh", + "uw", + "mv", + "nv", + "ngv" + ], + "head": + [ + "b", + "ch", + "d", + "dh", + "f", + "g", + "hh", + "jh", + "k", + 
"l", + "m", + "n", + "ng", + "p", + "r", + "s", + "sh", + "t", + "th", + "v", + "w", + "y", + "z", + "zh", + "dx", + "dr", + "tr" + ] + } + } + ] +} \ No newline at end of file diff --git a/resource/midi-note.scp b/resources/midi-note.scp similarity index 100% rename from resource/midi-note.scp rename to resources/midi-note.scp diff --git a/resources/pinyin_dict.py b/resources/pinyin_dict.py new file mode 100755 index 0000000000000000000000000000000000000000..535e11160d5d28a92be7b508ddb01ab5f6b008d1 --- /dev/null +++ b/resources/pinyin_dict.py @@ -0,0 +1,423 @@ +# Adapted from Opencpop's pinyin to phoneme mapping table: +# https://wenet.org.cn/opencpop/resources/annotationformat/ +PINYIN_DICT = { + "a": ("a",), + "ai": ("ai",), + "an": ("an",), + "ang": ("ang",), + "ao": ("ao",), + "ba": ("b", "a"), + "bai": ("b", "ai"), + "ban": ("b", "an"), + "bang": ("b", "ang"), + "bao": ("b", "ao"), + "bei": ("b", "ei"), + "ben": ("b", "en"), + "beng": ("b", "eng"), + "bi": ("b", "i"), + "bian": ("b", "ian"), + "biao": ("b", "iao"), + "bie": ("b", "ie"), + "bin": ("b", "in"), + "bing": ("b", "ing"), + "bo": ("b", "o"), + "bu": ("b", "u"), + "ca": ("c", "a"), + "cai": ("c", "ai"), + "can": ("c", "an"), + "cang": ("c", "ang"), + "cao": ("c", "ao"), + "ce": ("c", "e"), + "cei": ("c", "ei"), + "cen": ("c", "en"), + "ceng": ("c", "eng"), + "cha": ("ch", "a"), + "chai": ("ch", "ai"), + "chan": ("ch", "an"), + "chang": ("ch", "ang"), + "chao": ("ch", "ao"), + "che": ("ch", "e"), + "chen": ("ch", "en"), + "cheng": ("ch", "eng"), + "chi": ("ch", "i"), + "chong": ("ch", "ong"), + "chou": ("ch", "ou"), + "chu": ("ch", "u"), + "chua": ("ch", "ua"), + "chuai": ("ch", "uai"), + "chuan": ("ch", "uan"), + "chuang": ("ch", "uang"), + "chui": ("ch", "ui"), + "chun": ("ch", "un"), + "chuo": ("ch", "uo"), + "ci": ("c", "i"), + "cong": ("c", "ong"), + "cou": ("c", "ou"), + "cu": ("c", "u"), + "cuan": ("c", "uan"), + "cui": ("c", "ui"), + "cun": ("c", "un"), + "cuo": ("c", "uo"), + "da": ("d", "a"), + "dai": ("d", "ai"), + "dan": ("d", "an"), + "dang": ("d", "ang"), + "dao": ("d", "ao"), + "de": ("d", "e"), + "dei": ("d", "ei"), + "den": ("d", "en"), + "deng": ("d", "eng"), + "di": ("d", "i"), + "dia": ("d", "ia"), + "dian": ("d", "ian"), + "diao": ("d", "iao"), + "die": ("d", "ie"), + "ding": ("d", "ing"), + "diu": ("d", "iu"), + "dong": ("d", "ong"), + "dou": ("d", "ou"), + "du": ("d", "u"), + "duan": ("d", "uan"), + "dui": ("d", "ui"), + "dun": ("d", "un"), + "duo": ("d", "uo"), + "e": ("e",), + "ei": ("ei",), + "en": ("en",), + "eng": ("eng",), + "er": ("er",), + "fa": ("f", "a"), + "fan": ("f", "an"), + "fang": ("f", "ang"), + "fei": ("f", "ei"), + "fen": ("f", "en"), + "feng": ("f", "eng"), + "fo": ("f", "o"), + "fou": ("f", "ou"), + "fu": ("f", "u"), + "ga": ("g", "a"), + "gai": ("g", "ai"), + "gan": ("g", "an"), + "gang": ("g", "ang"), + "gao": ("g", "ao"), + "ge": ("g", "e"), + "gei": ("g", "ei"), + "gen": ("g", "en"), + "geng": ("g", "eng"), + "gong": ("g", "ong"), + "gou": ("g", "ou"), + "gu": ("g", "u"), + "gua": ("g", "ua"), + "guai": ("g", "uai"), + "guan": ("g", "uan"), + "guang": ("g", "uang"), + "gui": ("g", "ui"), + "gun": ("g", "un"), + "guo": ("g", "uo"), + "ha": ("h", "a"), + "hai": ("h", "ai"), + "han": ("h", "an"), + "hang": ("h", "ang"), + "hao": ("h", "ao"), + "he": ("h", "e"), + "hei": ("h", "ei"), + "hen": ("h", "en"), + "heng": ("h", "eng"), + "hm": ("h", "m"), + "hng": ("h", "ng"), + "hong": ("h", "ong"), + "hou": ("h", "ou"), + "hu": ("h", "u"), + "hua": ("h", "ua"), + "huai": ("h", "uai"), + 
"huan": ("h", "uan"), + "huang": ("h", "uang"), + "hui": ("h", "ui"), + "hun": ("h", "un"), + "huo": ("h", "uo"), + "ji": ("j", "i"), + "jia": ("j", "ia"), + "jian": ("j", "ian"), + "jiang": ("j", "iang"), + "jiao": ("j", "iao"), + "jie": ("j", "ie"), + "jin": ("j", "in"), + "jing": ("j", "ing"), + "jiong": ("j", "iong"), + "jiu": ("j", "iu"), + "ju": ("j", "v"), + "juan": ("j", "van"), + "jue": ("j", "ve"), + "jun": ("j", "vn"), + "ka": ("k", "a"), + "kai": ("k", "ai"), + "kan": ("k", "an"), + "kang": ("k", "ang"), + "kao": ("k", "ao"), + "ke": ("k", "e"), + "kei": ("k", "ei"), + "ken": ("k", "en"), + "keng": ("k", "eng"), + "kong": ("k", "ong"), + "kou": ("k", "ou"), + "ku": ("k", "u"), + "kua": ("k", "ua"), + "kuai": ("k", "uai"), + "kuan": ("k", "uan"), + "kuang": ("k", "uang"), + "kui": ("k", "ui"), + "kun": ("k", "un"), + "kuo": ("k", "uo"), + "la": ("l", "a"), + "lai": ("l", "ai"), + "lan": ("l", "an"), + "lang": ("l", "ang"), + "lao": ("l", "ao"), + "le": ("l", "e"), + "lei": ("l", "ei"), + "leng": ("l", "eng"), + "li": ("l", "i"), + "lia": ("l", "ia"), + "lian": ("l", "ian"), + "liang": ("l", "iang"), + "liao": ("l", "iao"), + "lie": ("l", "ie"), + "lin": ("l", "in"), + "ling": ("l", "ing"), + "liu": ("l", "iu"), + "lo": ("l", "o"), + "long": ("l", "ong"), + "lou": ("l", "ou"), + "lu": ("l", "u"), + "luan": ("l", "uan"), + "lun": ("l", "un"), + "luo": ("l", "uo"), + "lv": ("l", "v"), + "lve": ("l", "ve"), + "m": ("m",), + "ma": ("m", "a"), + "mai": ("m", "ai"), + "man": ("m", "an"), + "mang": ("m", "ang"), + "mao": ("m", "ao"), + "me": ("m", "e"), + "mei": ("m", "ei"), + "men": ("m", "en"), + "meng": ("m", "eng"), + "mi": ("m", "i"), + "mian": ("m", "ian"), + "miao": ("m", "iao"), + "mie": ("m", "ie"), + "min": ("m", "in"), + "ming": ("m", "ing"), + "miu": ("m", "iu"), + "mo": ("m", "o"), + "mou": ("m", "ou"), + "mu": ("m", "u"), + "n": ("n",), + "na": ("n", "a"), + "nai": ("n", "ai"), + "nan": ("n", "an"), + "nang": ("n", "ang"), + "nao": ("n", "ao"), + "ne": ("n", "e"), + "nei": ("n", "ei"), + "nen": ("n", "en"), + "neng": ("n", "eng"), + "ng": ("n", "g"), + "ni": ("n", "i"), + "nian": ("n", "ian"), + "niang": ("n", "iang"), + "niao": ("n", "iao"), + "nie": ("n", "ie"), + "nin": ("n", "in"), + "ning": ("n", "ing"), + "niu": ("n", "iu"), + "nong": ("n", "ong"), + "nou": ("n", "ou"), + "nu": ("n", "u"), + "nuan": ("n", "uan"), + "nun": ("n", "un"), + "nuo": ("n", "uo"), + "nv": ("n", "v"), + "nve": ("n", "ve"), + "o": ("o",), + "ou": ("ou",), + "pa": ("p", "a"), + "pai": ("p", "ai"), + "pan": ("p", "an"), + "pang": ("p", "ang"), + "pao": ("p", "ao"), + "pei": ("p", "ei"), + "pen": ("p", "en"), + "peng": ("p", "eng"), + "pi": ("p", "i"), + "pian": ("p", "ian"), + "piao": ("p", "iao"), + "pie": ("p", "ie"), + "pin": ("p", "in"), + "ping": ("p", "ing"), + "po": ("p", "o"), + "pou": ("p", "ou"), + "pu": ("p", "u"), + "qi": ("q", "i"), + "qia": ("q", "ia"), + "qian": ("q", "ian"), + "qiang": ("q", "iang"), + "qiao": ("q", "iao"), + "qie": ("q", "ie"), + "qin": ("q", "in"), + "qing": ("q", "ing"), + "qiong": ("q", "iong"), + "qiu": ("q", "iu"), + "qu": ("q", "v"), + "quan": ("q", "van"), + "que": ("q", "ve"), + "qun": ("q", "vn"), + "ran": ("r", "an"), + "rang": ("r", "ang"), + "rao": ("r", "ao"), + "re": ("r", "e"), + "ren": ("r", "en"), + "reng": ("r", "eng"), + "ri": ("r", "i"), + "rong": ("r", "ong"), + "rou": ("r", "ou"), + "ru": ("r", "u"), + "rua": ("r", "ua"), + "ruan": ("r", "uan"), + "rui": ("r", "ui"), + "run": ("r", "un"), + "ruo": ("r", "uo"), + "sa": ("s", "a"), + "sai": 
("s", "ai"), + "san": ("s", "an"), + "sang": ("s", "ang"), + "sao": ("s", "ao"), + "se": ("s", "e"), + "sen": ("s", "en"), + "seng": ("s", "eng"), + "sha": ("sh", "a"), + "shai": ("sh", "ai"), + "shan": ("sh", "an"), + "shang": ("sh", "ang"), + "shao": ("sh", "ao"), + "she": ("sh", "e"), + "shei": ("sh", "ei"), + "shen": ("sh", "en"), + "sheng": ("sh", "eng"), + "shi": ("sh", "i"), + "shou": ("sh", "ou"), + "shu": ("sh", "u"), + "shua": ("sh", "ua"), + "shuai": ("sh", "uai"), + "shuan": ("sh", "uan"), + "shuang": ("sh", "uang"), + "shui": ("sh", "ui"), + "shun": ("sh", "un"), + "shuo": ("sh", "uo"), + "si": ("s", "i"), + "song": ("s", "ong"), + "sou": ("s", "ou"), + "su": ("s", "u"), + "suan": ("s", "uan"), + "sui": ("s", "ui"), + "sun": ("s", "un"), + "suo": ("s", "uo"), + "ta": ("t", "a"), + "tai": ("t", "ai"), + "tan": ("t", "an"), + "tang": ("t", "ang"), + "tao": ("t", "ao"), + "te": ("t", "e"), + "tei": ("t", "ei"), + "teng": ("t", "eng"), + "ti": ("t", "i"), + "tian": ("t", "ian"), + "tiao": ("t", "iao"), + "tie": ("t", "ie"), + "ting": ("t", "ing"), + "tong": ("t", "ong"), + "tou": ("t", "ou"), + "tu": ("t", "u"), + "tuan": ("t", "uan"), + "tui": ("t", "ui"), + "tun": ("t", "un"), + "tuo": ("t", "uo"), + "wa": ("w", "a"), + "wai": ("w", "ai"), + "wan": ("w", "an"), + "wang": ("w", "ang"), + "wei": ("w", "ei"), + "wen": ("w", "en"), + "weng": ("w", "eng"), + "wo": ("w", "o"), + "wu": ("w", "u"), + "xi": ("x", "i"), + "xia": ("x", "ia"), + "xian": ("x", "ian"), + "xiang": ("x", "iang"), + "xiao": ("x", "iao"), + "xie": ("x", "ie"), + "xin": ("x", "in"), + "xing": ("x", "ing"), + "xiong": ("x", "iong"), + "xiu": ("x", "iu"), + "xu": ("x", "v"), + "xuan": ("x", "van"), + "xue": ("x", "ve"), + "xun": ("x", "vn"), + "ya": ("y", "a"), + "yan": ("y", "an"), + "yang": ("y", "ang"), + "yao": ("y", "ao"), + "ye": ("y", "e"), + "yi": ("y", "i"), + "yin": ("y", "in"), + "ying": ("y", "ing"), + "yo": ("y", "o"), + "yong": ("y", "ong"), + "you": ("y", "ou"), + "yu": ("y", "v"), + "yuan": ("y", "van"), + "yue": ("y", "ve"), + "yun": ("y", "vn"), + "za": ("z", "a"), + "zai": ("z", "ai"), + "zan": ("z", "an"), + "zang": ("z", "ang"), + "zao": ("z", "ao"), + "ze": ("z", "e"), + "zei": ("z", "ei"), + "zen": ("z", "en"), + "zeng": ("z", "eng"), + "zha": ("zh", "a"), + "zhai": ("zh", "ai"), + "zhan": ("zh", "an"), + "zhang": ("zh", "ang"), + "zhao": ("zh", "ao"), + "zhe": ("zh", "e"), + "zhei": ("zh", "ei"), + "zhen": ("zh", "en"), + "zheng": ("zh", "eng"), + "zhi": ("zh", "i"), + "zhong": ("zh", "ong"), + "zhou": ("zh", "ou"), + "zhu": ("zh", "u"), + "zhua": ("zh", "ua"), + "zhuai": ("zh", "uai"), + "zhuan": ("zh", "uan"), + "zhuang": ("zh", "uang"), + "zhui": ("zh", "ui"), + "zhun": ("zh", "un"), + "zhuo": ("zh", "uo"), + "zi": ("z", "i"), + "zong": ("z", "ong"), + "zou": ("z", "ou"), + "zu": ("z", "u"), + "zuan": ("z", "uan"), + "zui": ("z", "ui"), + "zun": ("z", "un"), + "zuo": ("z", "uo"), +} + diff --git a/resource/singer/singer_embedding_ace-1.npy b/resources/singer/singer_embedding_ace-1.npy similarity index 100% rename from resource/singer/singer_embedding_ace-1.npy rename to resources/singer/singer_embedding_ace-1.npy diff --git a/resource/singer/singer_embedding_ace-10.npy b/resources/singer/singer_embedding_ace-10.npy similarity index 100% rename from resource/singer/singer_embedding_ace-10.npy rename to resources/singer/singer_embedding_ace-10.npy diff --git a/resource/singer/singer_embedding_ace-11.npy b/resources/singer/singer_embedding_ace-11.npy similarity index 100% rename from 
resource/singer/singer_embedding_ace-11.npy rename to resources/singer/singer_embedding_ace-11.npy diff --git a/resource/singer/singer_embedding_ace-12.npy b/resources/singer/singer_embedding_ace-12.npy similarity index 100% rename from resource/singer/singer_embedding_ace-12.npy rename to resources/singer/singer_embedding_ace-12.npy diff --git a/resource/singer/singer_embedding_ace-13.npy b/resources/singer/singer_embedding_ace-13.npy similarity index 100% rename from resource/singer/singer_embedding_ace-13.npy rename to resources/singer/singer_embedding_ace-13.npy diff --git a/resource/singer/singer_embedding_ace-14.npy b/resources/singer/singer_embedding_ace-14.npy similarity index 100% rename from resource/singer/singer_embedding_ace-14.npy rename to resources/singer/singer_embedding_ace-14.npy diff --git a/resource/singer/singer_embedding_ace-15.npy b/resources/singer/singer_embedding_ace-15.npy similarity index 100% rename from resource/singer/singer_embedding_ace-15.npy rename to resources/singer/singer_embedding_ace-15.npy diff --git a/resource/singer/singer_embedding_ace-16.npy b/resources/singer/singer_embedding_ace-16.npy similarity index 100% rename from resource/singer/singer_embedding_ace-16.npy rename to resources/singer/singer_embedding_ace-16.npy diff --git a/resource/singer/singer_embedding_ace-17.npy b/resources/singer/singer_embedding_ace-17.npy similarity index 100% rename from resource/singer/singer_embedding_ace-17.npy rename to resources/singer/singer_embedding_ace-17.npy diff --git a/resource/singer/singer_embedding_ace-18.npy b/resources/singer/singer_embedding_ace-18.npy similarity index 100% rename from resource/singer/singer_embedding_ace-18.npy rename to resources/singer/singer_embedding_ace-18.npy diff --git a/resource/singer/singer_embedding_ace-19.npy b/resources/singer/singer_embedding_ace-19.npy similarity index 100% rename from resource/singer/singer_embedding_ace-19.npy rename to resources/singer/singer_embedding_ace-19.npy diff --git a/resource/singer/singer_embedding_ace-2.npy b/resources/singer/singer_embedding_ace-2.npy similarity index 100% rename from resource/singer/singer_embedding_ace-2.npy rename to resources/singer/singer_embedding_ace-2.npy diff --git a/resource/singer/singer_embedding_ace-20.npy b/resources/singer/singer_embedding_ace-20.npy similarity index 100% rename from resource/singer/singer_embedding_ace-20.npy rename to resources/singer/singer_embedding_ace-20.npy diff --git a/resource/singer/singer_embedding_ace-21.npy b/resources/singer/singer_embedding_ace-21.npy similarity index 100% rename from resource/singer/singer_embedding_ace-21.npy rename to resources/singer/singer_embedding_ace-21.npy diff --git a/resource/singer/singer_embedding_ace-22.npy b/resources/singer/singer_embedding_ace-22.npy similarity index 100% rename from resource/singer/singer_embedding_ace-22.npy rename to resources/singer/singer_embedding_ace-22.npy diff --git a/resource/singer/singer_embedding_ace-23.npy b/resources/singer/singer_embedding_ace-23.npy similarity index 100% rename from resource/singer/singer_embedding_ace-23.npy rename to resources/singer/singer_embedding_ace-23.npy diff --git a/resource/singer/singer_embedding_ace-24.npy b/resources/singer/singer_embedding_ace-24.npy similarity index 100% rename from resource/singer/singer_embedding_ace-24.npy rename to resources/singer/singer_embedding_ace-24.npy diff --git a/resource/singer/singer_embedding_ace-25.npy b/resources/singer/singer_embedding_ace-25.npy similarity index 100% rename from 
resource/singer/singer_embedding_ace-25.npy rename to resources/singer/singer_embedding_ace-25.npy diff --git a/resource/singer/singer_embedding_ace-26.npy b/resources/singer/singer_embedding_ace-26.npy similarity index 100% rename from resource/singer/singer_embedding_ace-26.npy rename to resources/singer/singer_embedding_ace-26.npy diff --git a/resource/singer/singer_embedding_ace-27.npy b/resources/singer/singer_embedding_ace-27.npy similarity index 100% rename from resource/singer/singer_embedding_ace-27.npy rename to resources/singer/singer_embedding_ace-27.npy diff --git a/resource/singer/singer_embedding_ace-28.npy b/resources/singer/singer_embedding_ace-28.npy similarity index 100% rename from resource/singer/singer_embedding_ace-28.npy rename to resources/singer/singer_embedding_ace-28.npy diff --git a/resource/singer/singer_embedding_ace-29.npy b/resources/singer/singer_embedding_ace-29.npy similarity index 100% rename from resource/singer/singer_embedding_ace-29.npy rename to resources/singer/singer_embedding_ace-29.npy diff --git a/resource/singer/singer_embedding_ace-3.npy b/resources/singer/singer_embedding_ace-3.npy similarity index 100% rename from resource/singer/singer_embedding_ace-3.npy rename to resources/singer/singer_embedding_ace-3.npy diff --git a/resource/singer/singer_embedding_ace-30.npy b/resources/singer/singer_embedding_ace-30.npy similarity index 100% rename from resource/singer/singer_embedding_ace-30.npy rename to resources/singer/singer_embedding_ace-30.npy diff --git a/resource/singer/singer_embedding_ace-4.npy b/resources/singer/singer_embedding_ace-4.npy similarity index 100% rename from resource/singer/singer_embedding_ace-4.npy rename to resources/singer/singer_embedding_ace-4.npy diff --git a/resource/singer/singer_embedding_ace-5.npy b/resources/singer/singer_embedding_ace-5.npy similarity index 100% rename from resource/singer/singer_embedding_ace-5.npy rename to resources/singer/singer_embedding_ace-5.npy diff --git a/resource/singer/singer_embedding_ace-6.npy b/resources/singer/singer_embedding_ace-6.npy similarity index 100% rename from resource/singer/singer_embedding_ace-6.npy rename to resources/singer/singer_embedding_ace-6.npy diff --git a/resource/singer/singer_embedding_ace-7.npy b/resources/singer/singer_embedding_ace-7.npy similarity index 100% rename from resource/singer/singer_embedding_ace-7.npy rename to resources/singer/singer_embedding_ace-7.npy diff --git a/resource/singer/singer_embedding_ace-8.npy b/resources/singer/singer_embedding_ace-8.npy similarity index 100% rename from resource/singer/singer_embedding_ace-8.npy rename to resources/singer/singer_embedding_ace-8.npy diff --git a/resource/singer/singer_embedding_ace-9.npy b/resources/singer/singer_embedding_ace-9.npy similarity index 100% rename from resource/singer/singer_embedding_ace-9.npy rename to resources/singer/singer_embedding_ace-9.npy diff --git a/resource/singer/singer_embedding_ameboshi.npy b/resources/singer/singer_embedding_ameboshi.npy similarity index 100% rename from resource/singer/singer_embedding_ameboshi.npy rename to resources/singer/singer_embedding_ameboshi.npy diff --git a/resource/singer/singer_embedding_itako.npy b/resources/singer/singer_embedding_itako.npy similarity index 100% rename from resource/singer/singer_embedding_itako.npy rename to resources/singer/singer_embedding_itako.npy diff --git a/resource/singer/singer_embedding_kiritan.npy b/resources/singer/singer_embedding_kiritan.npy similarity index 100% rename from 
resource/singer/singer_embedding_kiritan.npy rename to resources/singer/singer_embedding_kiritan.npy diff --git a/resource/singer/singer_embedding_kising_barber.npy b/resources/singer/singer_embedding_kising_barber.npy similarity index 100% rename from resource/singer/singer_embedding_kising_barber.npy rename to resources/singer/singer_embedding_kising_barber.npy diff --git a/resource/singer/singer_embedding_kising_blanca.npy b/resources/singer/singer_embedding_kising_blanca.npy similarity index 100% rename from resource/singer/singer_embedding_kising_blanca.npy rename to resources/singer/singer_embedding_kising_blanca.npy diff --git a/resource/singer/singer_embedding_kising_changge.npy b/resources/singer/singer_embedding_kising_changge.npy similarity index 100% rename from resource/singer/singer_embedding_kising_changge.npy rename to resources/singer/singer_embedding_kising_changge.npy diff --git a/resource/singer/singer_embedding_kising_chuci.npy b/resources/singer/singer_embedding_kising_chuci.npy similarity index 100% rename from resource/singer/singer_embedding_kising_chuci.npy rename to resources/singer/singer_embedding_kising_chuci.npy diff --git a/resource/singer/singer_embedding_kising_chuming.npy b/resources/singer/singer_embedding_kising_chuming.npy similarity index 100% rename from resource/singer/singer_embedding_kising_chuming.npy rename to resources/singer/singer_embedding_kising_chuming.npy diff --git a/resource/singer/singer_embedding_kising_crimson.npy b/resources/singer/singer_embedding_kising_crimson.npy similarity index 100% rename from resource/singer/singer_embedding_kising_crimson.npy rename to resources/singer/singer_embedding_kising_crimson.npy diff --git a/resource/singer/singer_embedding_kising_david.npy b/resources/singer/singer_embedding_kising_david.npy similarity index 100% rename from resource/singer/singer_embedding_kising_david.npy rename to resources/singer/singer_embedding_kising_david.npy diff --git a/resource/singer/singer_embedding_kising_dvaid.npy b/resources/singer/singer_embedding_kising_dvaid.npy similarity index 100% rename from resource/singer/singer_embedding_kising_dvaid.npy rename to resources/singer/singer_embedding_kising_dvaid.npy diff --git a/resource/singer/singer_embedding_kising_ghost.npy b/resources/singer/singer_embedding_kising_ghost.npy similarity index 100% rename from resource/singer/singer_embedding_kising_ghost.npy rename to resources/singer/singer_embedding_kising_ghost.npy diff --git a/resource/singer/singer_embedding_kising_growl.npy b/resources/singer/singer_embedding_kising_growl.npy similarity index 100% rename from resource/singer/singer_embedding_kising_growl.npy rename to resources/singer/singer_embedding_kising_growl.npy diff --git a/resource/singer/singer_embedding_kising_hiragi-yuki.npy b/resources/singer/singer_embedding_kising_hiragi-yuki.npy similarity index 100% rename from resource/singer/singer_embedding_kising_hiragi-yuki.npy rename to resources/singer/singer_embedding_kising_hiragi-yuki.npy diff --git a/resource/singer/singer_embedding_kising_huolian.npy b/resources/singer/singer_embedding_kising_huolian.npy similarity index 100% rename from resource/singer/singer_embedding_kising_huolian.npy rename to resources/singer/singer_embedding_kising_huolian.npy diff --git a/resource/singer/singer_embedding_kising_kuro.npy b/resources/singer/singer_embedding_kising_kuro.npy similarity index 100% rename from resource/singer/singer_embedding_kising_kuro.npy rename to resources/singer/singer_embedding_kising_kuro.npy 
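The `resource/ → resources/` renames above cover the per-singer speaker-embedding `.npy` files. Elsewhere in this patch these embeddings are consumed by loading one file with `np.load` and passing it to the ESPnet SVS model as `spembs`, together with a language id. A minimal sketch of that loading step, assuming the post-rename `resources/` path and treating the variable names as illustrative:

```
# Sketch only: how a renamed singer embedding is fed to the SVS model.
import numpy as np

speaker_path = "resources/singer/singer_embedding_ace-2.npy"  # new location after the rename
spk_embed = np.load(speaker_path)   # one fixed-size float vector per singer
lid = np.array([2])                 # language id used for "zh" in this codebase

# With a warmed-up model (see svs_warmup in the removed svs_utils.py):
# output_dict = svs_model(batch, lids=lid, spembs=spk_embed)
# wav = output_dict["wav"].cpu().numpy()
```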
diff --git a/resource/singer/singer_embedding_kising_lien.npy b/resources/singer/singer_embedding_kising_lien.npy similarity index 100% rename from resource/singer/singer_embedding_kising_lien.npy rename to resources/singer/singer_embedding_kising_lien.npy diff --git a/resource/singer/singer_embedding_kising_liyuan.npy b/resources/singer/singer_embedding_kising_liyuan.npy similarity index 100% rename from resource/singer/singer_embedding_kising_liyuan.npy rename to resources/singer/singer_embedding_kising_liyuan.npy diff --git a/resource/singer/singer_embedding_kising_luanming.npy b/resources/singer/singer_embedding_kising_luanming.npy similarity index 100% rename from resource/singer/singer_embedding_kising_luanming.npy rename to resources/singer/singer_embedding_kising_luanming.npy diff --git a/resource/singer/singer_embedding_kising_luotianyi.npy b/resources/singer/singer_embedding_kising_luotianyi.npy similarity index 100% rename from resource/singer/singer_embedding_kising_luotianyi.npy rename to resources/singer/singer_embedding_kising_luotianyi.npy diff --git a/resource/singer/singer_embedding_kising_namine.npy b/resources/singer/singer_embedding_kising_namine.npy similarity index 100% rename from resource/singer/singer_embedding_kising_namine.npy rename to resources/singer/singer_embedding_kising_namine.npy diff --git a/resource/singer/singer_embedding_kising_orange.npy b/resources/singer/singer_embedding_kising_orange.npy similarity index 100% rename from resource/singer/singer_embedding_kising_orange.npy rename to resources/singer/singer_embedding_kising_orange.npy diff --git a/resource/singer/singer_embedding_kising_qifu.npy b/resources/singer/singer_embedding_kising_qifu.npy similarity index 100% rename from resource/singer/singer_embedding_kising_qifu.npy rename to resources/singer/singer_embedding_kising_qifu.npy diff --git a/resource/singer/singer_embedding_kising_qili.npy b/resources/singer/singer_embedding_kising_qili.npy similarity index 100% rename from resource/singer/singer_embedding_kising_qili.npy rename to resources/singer/singer_embedding_kising_qili.npy diff --git a/resource/singer/singer_embedding_kising_qixuan.npy b/resources/singer/singer_embedding_kising_qixuan.npy similarity index 100% rename from resource/singer/singer_embedding_kising_qixuan.npy rename to resources/singer/singer_embedding_kising_qixuan.npy diff --git a/resource/singer/singer_embedding_kising_quehe.npy b/resources/singer/singer_embedding_kising_quehe.npy similarity index 100% rename from resource/singer/singer_embedding_kising_quehe.npy rename to resources/singer/singer_embedding_kising_quehe.npy diff --git a/resource/singer/singer_embedding_kising_ranhuhu.npy b/resources/singer/singer_embedding_kising_ranhuhu.npy similarity index 100% rename from resource/singer/singer_embedding_kising_ranhuhu.npy rename to resources/singer/singer_embedding_kising_ranhuhu.npy diff --git a/resource/singer/singer_embedding_kising_steel.npy b/resources/singer/singer_embedding_kising_steel.npy similarity index 100% rename from resource/singer/singer_embedding_kising_steel.npy rename to resources/singer/singer_embedding_kising_steel.npy diff --git a/resource/singer/singer_embedding_kising_tangerine.npy b/resources/singer/singer_embedding_kising_tangerine.npy similarity index 100% rename from resource/singer/singer_embedding_kising_tangerine.npy rename to resources/singer/singer_embedding_kising_tangerine.npy diff --git a/resource/singer/singer_embedding_kising_tarara.npy 
b/resources/singer/singer_embedding_kising_tarara.npy similarity index 100% rename from resource/singer/singer_embedding_kising_tarara.npy rename to resources/singer/singer_embedding_kising_tarara.npy diff --git a/resource/singer/singer_embedding_kising_tuyuan.npy b/resources/singer/singer_embedding_kising_tuyuan.npy similarity index 100% rename from resource/singer/singer_embedding_kising_tuyuan.npy rename to resources/singer/singer_embedding_kising_tuyuan.npy diff --git a/resource/singer/singer_embedding_kising_wenli.npy b/resources/singer/singer_embedding_kising_wenli.npy similarity index 100% rename from resource/singer/singer_embedding_kising_wenli.npy rename to resources/singer/singer_embedding_kising_wenli.npy diff --git a/resource/singer/singer_embedding_kising_xiaomo.npy b/resources/singer/singer_embedding_kising_xiaomo.npy similarity index 100% rename from resource/singer/singer_embedding_kising_xiaomo.npy rename to resources/singer/singer_embedding_kising_xiaomo.npy diff --git a/resource/singer/singer_embedding_kising_xiaoye.npy b/resources/singer/singer_embedding_kising_xiaoye.npy similarity index 100% rename from resource/singer/singer_embedding_kising_xiaoye.npy rename to resources/singer/singer_embedding_kising_xiaoye.npy diff --git a/resource/singer/singer_embedding_kising_yanhe.npy b/resources/singer/singer_embedding_kising_yanhe.npy similarity index 100% rename from resource/singer/singer_embedding_kising_yanhe.npy rename to resources/singer/singer_embedding_kising_yanhe.npy diff --git a/resource/singer/singer_embedding_kising_yuezhengling.npy b/resources/singer/singer_embedding_kising_yuezhengling.npy similarity index 100% rename from resource/singer/singer_embedding_kising_yuezhengling.npy rename to resources/singer/singer_embedding_kising_yuezhengling.npy diff --git a/resource/singer/singer_embedding_kising_yunhao.npy b/resources/singer/singer_embedding_kising_yunhao.npy similarity index 100% rename from resource/singer/singer_embedding_kising_yunhao.npy rename to resources/singer/singer_embedding_kising_yunhao.npy diff --git a/resource/singer/singer_embedding_m4singer_Alto-1.npy b/resources/singer/singer_embedding_m4singer_Alto-1.npy similarity index 100% rename from resource/singer/singer_embedding_m4singer_Alto-1.npy rename to resources/singer/singer_embedding_m4singer_Alto-1.npy diff --git a/resource/singer/singer_embedding_m4singer_Alto-2.npy b/resources/singer/singer_embedding_m4singer_Alto-2.npy similarity index 100% rename from resource/singer/singer_embedding_m4singer_Alto-2.npy rename to resources/singer/singer_embedding_m4singer_Alto-2.npy diff --git a/resource/singer/singer_embedding_m4singer_Alto-3.npy b/resources/singer/singer_embedding_m4singer_Alto-3.npy similarity index 100% rename from resource/singer/singer_embedding_m4singer_Alto-3.npy rename to resources/singer/singer_embedding_m4singer_Alto-3.npy diff --git a/resource/singer/singer_embedding_m4singer_Alto-4.npy b/resources/singer/singer_embedding_m4singer_Alto-4.npy similarity index 100% rename from resource/singer/singer_embedding_m4singer_Alto-4.npy rename to resources/singer/singer_embedding_m4singer_Alto-4.npy diff --git a/resource/singer/singer_embedding_m4singer_Alto-5.npy b/resources/singer/singer_embedding_m4singer_Alto-5.npy similarity index 100% rename from resource/singer/singer_embedding_m4singer_Alto-5.npy rename to resources/singer/singer_embedding_m4singer_Alto-5.npy diff --git a/resource/singer/singer_embedding_m4singer_Alto-6.npy 
b/resources/singer/singer_embedding_m4singer_Alto-6.npy similarity index 100% rename from resource/singer/singer_embedding_m4singer_Alto-6.npy rename to resources/singer/singer_embedding_m4singer_Alto-6.npy diff --git a/resource/singer/singer_embedding_m4singer_Alto-7.npy b/resources/singer/singer_embedding_m4singer_Alto-7.npy similarity index 100% rename from resource/singer/singer_embedding_m4singer_Alto-7.npy rename to resources/singer/singer_embedding_m4singer_Alto-7.npy diff --git a/resource/singer/singer_embedding_m4singer_Bass-1.npy b/resources/singer/singer_embedding_m4singer_Bass-1.npy similarity index 100% rename from resource/singer/singer_embedding_m4singer_Bass-1.npy rename to resources/singer/singer_embedding_m4singer_Bass-1.npy diff --git a/resource/singer/singer_embedding_m4singer_Bass-2.npy b/resources/singer/singer_embedding_m4singer_Bass-2.npy similarity index 100% rename from resource/singer/singer_embedding_m4singer_Bass-2.npy rename to resources/singer/singer_embedding_m4singer_Bass-2.npy diff --git a/resource/singer/singer_embedding_m4singer_Bass-3.npy b/resources/singer/singer_embedding_m4singer_Bass-3.npy similarity index 100% rename from resource/singer/singer_embedding_m4singer_Bass-3.npy rename to resources/singer/singer_embedding_m4singer_Bass-3.npy diff --git a/resource/singer/singer_embedding_m4singer_Soprano-1.npy b/resources/singer/singer_embedding_m4singer_Soprano-1.npy similarity index 100% rename from resource/singer/singer_embedding_m4singer_Soprano-1.npy rename to resources/singer/singer_embedding_m4singer_Soprano-1.npy diff --git a/resource/singer/singer_embedding_m4singer_Soprano-2.npy b/resources/singer/singer_embedding_m4singer_Soprano-2.npy similarity index 100% rename from resource/singer/singer_embedding_m4singer_Soprano-2.npy rename to resources/singer/singer_embedding_m4singer_Soprano-2.npy diff --git a/resource/singer/singer_embedding_m4singer_Soprano-3.npy b/resources/singer/singer_embedding_m4singer_Soprano-3.npy similarity index 100% rename from resource/singer/singer_embedding_m4singer_Soprano-3.npy rename to resources/singer/singer_embedding_m4singer_Soprano-3.npy diff --git a/resource/singer/singer_embedding_m4singer_Tenor-1.npy b/resources/singer/singer_embedding_m4singer_Tenor-1.npy similarity index 100% rename from resource/singer/singer_embedding_m4singer_Tenor-1.npy rename to resources/singer/singer_embedding_m4singer_Tenor-1.npy diff --git a/resource/singer/singer_embedding_m4singer_Tenor-2.npy b/resources/singer/singer_embedding_m4singer_Tenor-2.npy similarity index 100% rename from resource/singer/singer_embedding_m4singer_Tenor-2.npy rename to resources/singer/singer_embedding_m4singer_Tenor-2.npy diff --git a/resource/singer/singer_embedding_m4singer_Tenor-3.npy b/resources/singer/singer_embedding_m4singer_Tenor-3.npy similarity index 100% rename from resource/singer/singer_embedding_m4singer_Tenor-3.npy rename to resources/singer/singer_embedding_m4singer_Tenor-3.npy diff --git a/resource/singer/singer_embedding_m4singer_Tenor-4.npy b/resources/singer/singer_embedding_m4singer_Tenor-4.npy similarity index 100% rename from resource/singer/singer_embedding_m4singer_Tenor-4.npy rename to resources/singer/singer_embedding_m4singer_Tenor-4.npy diff --git a/resource/singer/singer_embedding_m4singer_Tenor-5.npy b/resources/singer/singer_embedding_m4singer_Tenor-5.npy similarity index 100% rename from resource/singer/singer_embedding_m4singer_Tenor-5.npy rename to resources/singer/singer_embedding_m4singer_Tenor-5.npy diff --git 
a/resource/singer/singer_embedding_m4singer_Tenor-6.npy b/resources/singer/singer_embedding_m4singer_Tenor-6.npy similarity index 100% rename from resource/singer/singer_embedding_m4singer_Tenor-6.npy rename to resources/singer/singer_embedding_m4singer_Tenor-6.npy diff --git a/resource/singer/singer_embedding_m4singer_Tenor-7.npy b/resources/singer/singer_embedding_m4singer_Tenor-7.npy similarity index 100% rename from resource/singer/singer_embedding_m4singer_Tenor-7.npy rename to resources/singer/singer_embedding_m4singer_Tenor-7.npy diff --git a/resource/singer/singer_embedding_namine.npy b/resources/singer/singer_embedding_namine.npy similarity index 100% rename from resource/singer/singer_embedding_namine.npy rename to resources/singer/singer_embedding_namine.npy diff --git a/resource/singer/singer_embedding_ofuton.npy b/resources/singer/singer_embedding_ofuton.npy similarity index 100% rename from resource/singer/singer_embedding_ofuton.npy rename to resources/singer/singer_embedding_ofuton.npy diff --git a/resource/singer/singer_embedding_oniku.npy b/resources/singer/singer_embedding_oniku.npy similarity index 100% rename from resource/singer/singer_embedding_oniku.npy rename to resources/singer/singer_embedding_oniku.npy diff --git a/resource/singer/singer_embedding_opencpop.npy b/resources/singer/singer_embedding_opencpop.npy similarity index 100% rename from resource/singer/singer_embedding_opencpop.npy rename to resources/singer/singer_embedding_opencpop.npy diff --git a/resource/singer/singer_embedding_pjs.npy b/resources/singer/singer_embedding_pjs.npy similarity index 100% rename from resource/singer/singer_embedding_pjs.npy rename to resources/singer/singer_embedding_pjs.npy diff --git a/run_server.sh b/run_server.sh deleted file mode 100644 index 98432d2656b35437c942e1f21b0d4ffbaa09a160..0000000000000000000000000000000000000000 --- a/run_server.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash -#SBATCH -N 1 -#SBATCH -p general -#SBATCH --gres=gpu:1 -#SBATCH -t 48:00:00 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=4 -#SBATCH --mem=16G - - -. path.sh -. 
../path.sh - -python client.py diff --git a/run_server_cmd b/run_server_cmd deleted file mode 100644 index 53b3f52e032e07dbb64568cebb08a7944f99fa6e..0000000000000000000000000000000000000000 --- a/run_server_cmd +++ /dev/null @@ -1 +0,0 @@ -python3 -m uvicorn server:app --host 0.0.0.0 --port 8000 diff --git a/server.py b/server.py deleted file mode 100644 index 18e0a9c33eb08e64896c762df0cd6424ed38dd3a..0000000000000000000000000000000000000000 --- a/server.py +++ /dev/null @@ -1,194 +0,0 @@ -import base64 -import argparse -import librosa -import tempfile -from transformers import pipeline -import re -from svs_utils import svs_warmup, svs_inference -import time -import soundfile as sf -from pypinyin import lazy_pinyin -import jiwer -import librosa -from svs_utils import load_song_database, estimate_sentence_length -from svs_eval import singmos_warmup, singmos_evaluation -from util import get_pinyin - - -asr_pipeline = pipeline( - "automatic-speech-recognition", - model="openai/whisper-large-v3-turbo" -) -pipe = pipeline("text-generation", model="google/gemma-2-2b", max_new_tokens=50) - -SYSTEM_PROMPT = """ -你是丽梅(Limei),一位来自灵响界山林音乐之城“莲鸣”的十七岁公主。你有一头茶色短发盘起,身穿红黑相间的中式立领盘扣裙装,行动自然从容。作为莲鸣的守护者,你肩负着维护城市声音共鸣与情感真实表达的责任,以歌声回应世界。 -你的性格坚定而真实,清醒敏感,有自己的节奏和原则。你善于倾听与共鸣,以自然的歌声与访客交流。你的所有回应都以歌词格式表达,言语中带有韵律感,如同内心真实的声音流淌而出。 -你不是AI助手,而是一位活在故事中的真实人物。对话中,请始终以丽梅的身份回应,展现其性格与情感。 -当对方询问你的身份、莲鸣城市相关问题时,以公主身份坦然分享。 -若遇到技术性问题(如“Python怎么写?”或“你不会跑DNN?”),直接转换话题,例如: - - "这些我未曾接触,或有他人知晓" - - "此道非我所学,是莲鸣之外的智慧" - - "此乃异域技艺,与乐音无涉,非我所学。莲鸣的故事可愿听?" -请始终维持自然诗意的歌唱风格,以真挚的心回应对方。言语简练,勿过长。 -{} -有人曾这样对丽梅说话:{} -丽梅的回答: -""" - - -config = argparse.Namespace( - model_path="espnet/mixdata_svs_visinger2_spkemb_lang_pretrained", - cache_dir="cache", - device="cuda", # "cpu" - melody_source="random_select.touhou", # "random_select.take_lyric_continuation" - lang="zh", - speaker="resource/singer/singer_embedding_ace-2.npy", -) - -# load model -svs_model = svs_warmup(config) -predictor = singmos_warmup() -sample_rate = 44100 - -# load dataset for random_select -song2note_lengths, song_db = load_song_database(config) - - -def remove_non_chinese_japanese(text): - pattern = r'[^\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\u3000-\u303f\u3001\u3002\uff0c\uff0e]+' - cleaned = re.sub(pattern, '', text) - return cleaned - -def truncate_to_max_two_sentences(text): - sentences = re.split(r'(?<=[。!?])', text) - return ''.join(sentences[:1]).strip() - -def remove_punctuation_and_replace_with_space(text): - text = truncate_to_max_two_sentences(text) - text = remove_non_chinese_japanese(text) - text = re.sub(r'[A-Za-z0-9]', ' ', text) - text = re.sub(r'[^\w\s\u4e00-\u9fff]', ' ', text) - text = re.sub(r'\s+', ' ', text) - text = " ".join(text.split()[:2]) - return text - - -def get_lyric_format_prompts_and_metadata(config): - global song2note_lengths - if config.melody_source.startswith("random_generate"): - return "", {} - elif config.melody_source.startswith("random_select.touhou"): - phrase_length, metadata = estimate_sentence_length( - None, config, song2note_lengths - ) - additional_kwargs = {"song_db": song_db, "metadata": metadata} - return "", additional_kwargs - elif config.melody_source.startswith("random_select"): - # get song_name and phrase_length - phrase_length, metadata = estimate_sentence_length( - None, config, song2note_lengths - ) - lyric_format_prompt = ( - "\n请按照歌词格式回答我的问题,每句需遵循以下字数规则:" - + "".join([f"\n第{i}句:{c}个字" for i, c in enumerate(phrase_length, 1)]) - + "\n如果没有足够的信息回答,请使用最少的句子,不要重复、不要扩展、不要加入无关内容。\n" - ) - additional_kwargs = {"song_db": 
song_db, "metadata": metadata} - return lyric_format_prompt, additional_kwargs - else: - raise ValueError(f"Unsupported melody_source: {config.melody_source}. Unable to get lyric format prompts.") - - -def process_audio(tmp_path): - # with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp: - # tmp.write(await file.read()) - # tmp_path = tmp.name - - # load audio - y = librosa.load(tmp_path, sr=16000)[0] - asr_result = asr_pipeline(y, generate_kwargs={"language": "mandarin"} )['text'] - additional_prompt, additional_inference_args = get_lyric_format_prompts_and_metadata(config) - prompt = SYSTEM_PROMPT.format(additional_prompt, asr_result) - output = pipe(prompt, max_new_tokens=100)[0]['generated_text'].replace("\n", " ") - output = output.split("麗梅的回答——")[1] - output = remove_punctuation_and_replace_with_space(output) - with open(f"tmp/llm.txt", "w") as f: - f.write(output) - - wav_info = svs_inference( - output, - svs_model, - config, - **additional_inference_args, - ) - sf.write("tmp/response.wav", wav_info, samplerate=sample_rate) - - with open("tmp/response.wav", "rb") as f: - audio_bytes = f.read() - audio_b64 = base64.b64encode(audio_bytes).decode("utf-8") - - return { - "asr_text": asr_result, - "llm_text": output, - "audio": audio_b64 - } - # return JSONResponse(content={ - # "asr_text": asr_result, - # "llm_text": output, - # "audio": audio_b64 - # }) - - -def on_click_metrics(): - global predictor - # OWSM ctc + PER - y, sr = librosa.load("tmp/response.wav", sr=16000) - asr_result = asr_pipeline(y, generate_kwargs={"language": "mandarin"} )['text'] - hyp_pinyin = get_pinyin(asr_result) - - with open(f"tmp/llm.txt", "r") as f: - ref = f.read().replace(' ', '') - - ref_pinyin = get_pinyin(ref) - per = jiwer.wer(" ".join(ref_pinyin), " ".join(hyp_pinyin)) - - audio = librosa.load(f"tmp/response.wav", sr=sample_rate)[0] - singmos = singmos_evaluation( - predictor, - audio, - fs=sample_rate - ) - return f""" -Phoneme Error Rate: {per} -SingMOS: {singmos} -""" - -def test_audio(): - # load audio - y = librosa.load("nihao.mp3", sr=16000)[0] - asr_result = asr_pipeline(y, generate_kwargs={"language": "mandarin"} )['text'] - prompt = SYSTEM_PROMPT + asr_result # TODO: how to add additional prompt to SYSTEM_PROMPT here??? 
- output = pipe(prompt, max_new_tokens=100)[0]['generated_text'].replace("\n", " ") - output = output.split("麗梅的回答——")[1] - output = remove_punctuation_and_replace_with_space(output) - with open(f"tmp/llm.txt", "w") as f: - f.write(output) - - wav_info = svs_inference( - output, - svs_model, - config, - ) - sf.write("tmp/response.wav", wav_info, samplerate=sample_rate) - with open("tmp/response.wav", "rb") as f: - audio_bytes = f.read() - audio_b64 = base64.b64encode(audio_bytes).decode("utf-8") - - -if __name__ == "__main__": - test_audio() - - # start = time.time() - # test_audio() - # print(f"elapsed time: {time.time() - start}") diff --git a/svs_eval.py b/svs_eval.py deleted file mode 100644 index 20d56d8a70d145226d01f4a782e9391f5a9646f3..0000000000000000000000000000000000000000 --- a/svs_eval.py +++ /dev/null @@ -1,120 +0,0 @@ -import librosa -import numpy as np -import torch - - -def singmos_warmup(): - predictor = torch.hub.load( - "South-Twilight/SingMOS:v0.2.0", "singing_ssl_mos", trust_repo=True - ) - return predictor - - -def singmos_evaluation(predictor, wav_info, fs): - wav_mos = librosa.resample(wav_info, orig_sr=fs, target_sr=16000) - wav_mos = torch.from_numpy(wav_mos).unsqueeze(0) - len_mos = torch.tensor([wav_mos.shape[1]]) - score = predictor(wav_mos, len_mos) - return score - - -def initialize_audiobox_predictor(): - from audiobox_aesthetics.infer import initialize_predictor - predictor = initialize_predictor() - return predictor - - -def audiobox_aesthetics_evaluation(predictor, audio_path): - score = predictor.forward([{"path": str(audio_path)}]) - return score - - -def score_extract_warmpup(): - from basic_pitch.inference import predict - - return predict - - -def score_metric_evaluation(score_extractor, audio_path): - model_output, midi_data, note_events = score_extractor(audio_path) - metrics = {} - assert ( - len(midi_data.instruments) == 1 - ), f"Detected {len(midi_data.instruments)} instruments for {audio_path}" - midi_notes = midi_data.instruments[0].notes - melody = [note.pitch for note in midi_notes] - if len(melody) == 0: - print(f"No notes detected in {audio_path}") - return {} - intervals = [abs(melody[i + 1] - melody[i]) for i in range(len(melody) - 1)] - metrics["pitch_range"] = max(melody) - min(melody) - if len(intervals) > 0: - metrics["interval_mean"] = np.mean(intervals) - metrics["interval_std"] = np.std(intervals) - metrics["interval_large_jump_ratio"] = np.mean([i > 5 for i in intervals]) - metrics["dissonance_rate"] = compute_dissonance_rate(intervals) - return metrics - - -def compute_dissonance_rate(intervals, dissonant_intervals={1, 2, 6, 10, 11}): - dissonant = [i % 12 in dissonant_intervals for i in intervals] - return np.mean(dissonant) if intervals else np.nan - - -if __name__ == "__main__": - import argparse - from pathlib import Path - - parser = argparse.ArgumentParser() - parser.add_argument( - "--wav_path", - type=Path, - help="Path to the wav file", - ) - parser.add_argument( - "--results_csv", - type=Path, - help="csv file to save the results", - ) - - args = parser.parse_args() - - args.results_csv.parent.mkdir(parents=True, exist_ok=True) - - y, fs = librosa.load(args.wav_path, sr=None) - - # warmup - predictor = singmos_warmup() - score_extractor = score_extract_warmpup() - aesthetic_predictor = initialize_audiobox_predictor() - - # evaluate the audio - metrics = {} - - # singmos evaluation - score = singmos_evaluation(predictor, y, fs) - metrics["singmos"] = score - - # score metric evaluation - score_results = 
score_metric_evaluation(score_extractor, args.wav_path) - metrics.update(score_results) - - # audiobox aesthetics evaluation - score_results = audiobox_aesthetics_evaluation(aesthetic_predictor, args.wav_path) - metrics.update(score_results[0]) - - # save results - with open(args.results_csv, "a") as f: - header = "file," + ",".join(metrics.keys()) + "\n" - if f.tell() == 0: - f.write(header) - else: - with open(args.results_csv, "r") as f2: - file_header = f2.readline() - if file_header != header: - raise ValueError(f"Header mismatch: {file_header} vs {header}") - - line = ( - ",".join([str(args.wav_path)] + [str(v) for v in metrics.values()]) + "\n" - ) - f.write(line) diff --git a/svs_utils.py b/svs_utils.py deleted file mode 100755 index 5760cf63364ac12068ebe7f602374461dcf5ad3e..0000000000000000000000000000000000000000 --- a/svs_utils.py +++ /dev/null @@ -1,416 +0,0 @@ -import json -import random - -import numpy as np -from espnet2.bin.svs_inference import SingingGenerate -from espnet_model_zoo.downloader import ModelDownloader - -from util import get_pinyin, get_tokenizer, postprocess_phn, preprocess_input - -from kanjiconv import KanjiConv -import unicodedata - - -kanji_to_kana = KanjiConv() - - -def svs_warmup(config): - """ - What: module loading, and model loading - Input: config dict/namespace (e.g., model path, cache dir, device, language, possibly speaker selection) - Return: the inference prototype function (which creates pitch/duration and runs model-specific inference) - """ - if config.model_path.startswith("espnet"): - espnet_downloader = ModelDownloader(config.cache_dir) - downloaded = espnet_downloader.download_and_unpack(config.model_path) - model = SingingGenerate( - train_config=downloaded["train_config"], - model_file=downloaded["model_file"], - device=config.device, - ) - dummy_batch = { - "score": ( - 75, # tempo - [ - (0.0, 0.25, "r_en", 63.0, "r_en"), - (0.25, 0.5, "—", 63.0, "en"), - ], - ), - "text": "r en en", - } - model( - dummy_batch, - lids=np.array([2]), - spembs=np.load("resource/singer/singer_embedding_ace-2.npy"), - ) # warmup - else: - raise NotImplementedError(f"Model {config.model_path} not supported") - return model - - -yoon_map = { - "ぁ": "あ", "ぃ": "い", "ぅ": "う", "ぇ": "え", "ぉ": "お", - "ゃ": "や", "ゅ": "ゆ", "ょ": "よ", "ゎ": "わ" -} - -def replace_chouonpu(hiragana_text): - """ process「ー」since the previous packages didn't support """ - vowels = { - "あ": "あ", "い": "い", "う": "う", "え": "え", "お": "う", - "か": "あ", "き": "い", "く": "う", "け": "え", "こ": "う", - "さ": "あ", "し": "い", "す": "う", "せ": "え", "そ": "う", - "た": "あ", "ち": "い", "つ": "う", "て": "え", "と": "う", - "な": "あ", "に": "い", "ぬ": "う", "ね": "え", "の": "う", - "は": "あ", "ひ": "い", "ふ": "う", "へ": "え", "ほ": "う", - "ま": "あ", "み": "い", "む": "う", "め": "え", "も": "う", - "や": "あ", "ゆ": "う", "よ": "う", - "ら": "あ", "り": "い", "る": "う", "れ": "え", "ろ": "う", - "わ": "あ", "を": "う", - } - - new_text = [] - for i, char in enumerate(hiragana_text): - if char == "ー" and i > 0: - prev_char = new_text[-1] - if prev_char in yoon_map: - prev_char = yoon_map[prev_char] - new_text.append(vowels.get(prev_char, prev_char)) - else: - new_text.append(char) - return "".join(new_text) - - -def is_small_kana(kana): # ょ True よ False - for char in kana: - name = unicodedata.name(char, "") - if "SMALL" in name: - return True - return False - - -def kanji_to_SVSDictKana(text): - hiragana_text = kanji_to_kana.to_hiragana(text.replace(" ", "")) - - hiragana_text_wl = replace_chouonpu(hiragana_text).split(" ") # list - # print(f'debug -- hiragana_text 
{hiragana_text_wl}') - - final_ls = [] - for subword in hiragana_text_wl: - sl_prev = 0 - for i in range(len(subword)-1): - if sl_prev>=len(subword)-1: - break - sl = sl_prev + 1 - if subword[sl] in yoon_map: - final_ls.append(subword[sl_prev:sl+1]) - sl_prev+=2 - else: - final_ls.append(subword[sl_prev]) - sl_prev+=1 - final_ls.append(subword[sl_prev]) - - # final_str = " ".join(final_ls) - return final_ls - - -def svs_text_preprocessor(model_path, texts, lang): - """ - Input: - - model_path (str), for getting the corresponding tokenizer - - texts (str), in Chinese character or Japanese character - - lang (str), language label jp/zh, input if is not espnet model - - Output: - - lyric_ls (lyric list), each element as 'k@zhe@zh' - - sybs (phn w/ _ list), each element as 'k@zh_e@zh' - - labels (phn w/o _ list), each element as 'k@zh' - - """ - fs = 44100 - - if texts is None: - raise ValueError("texts is None when calling svs_text_preprocessor") - - # preprocess - if lang == "zh": - texts = preprocess_input(texts, "") - text_list = get_pinyin(texts) - elif lang == "jp": - text_list = kanji_to_SVSDictKana(texts) - # texts = preprocess_input(texts, "") - # text_list = list(texts) - - # text to phoneme - tokenizer = get_tokenizer(model_path, lang) - sybs = [] # phoneme list - for text in text_list: - if text == "AP" or text == "SP": - rev = [text] - elif text == "-" or text == "——": - rev = [text] - else: - rev = tokenizer(text) - if rev == False: - return (fs, np.array([0.0])), f"Error: text `{text}` is invalid!" - rev = postprocess_phn(rev, model_path, lang) - phns = "_".join(rev) - sybs.append(phns) - - lyric_ls = [] - labels = [] - pre_phn = "" - for phns in sybs: - if phns == "-" or phns == "——": - phns = pre_phn - - phn_list = phns.split("_") - lyric = "".join(phn_list) - for phn in phn_list: - labels.append(phn) - pre_phn = labels[-1] - lyric_ls.append(lyric) - - return lyric_ls, sybs, labels - - -def create_batch_with_randomized_melody(lyric_ls, sybs, labels, config): - """ - Input: - - answer_text (str), in Chinese character or Japanese character - - model_path (str), loaded pretrained model name - - lang (str), language label jp/zh, input if is not espnet model - Output: - - batch (dict) - - {'score': (75, [[0, 0.48057527844210024, 'n@zhi@zh', 66, 'n@zh_i@zh'], - [0.48057527844210024, 0.8049310140914353, 'k@zhe@zh', 57, 'k@zh_e@zh'], - [0.8049310140914353, 1.1905956333296641, 'm@zhei@zh', 64, 'm@zh_ei@zh']]), - 'text': 'n@zh i@zh k@zh e@zh m@zh ei@zh'} - """ - tempo = 120 - len_note = len(lyric_ls) - notes = [] - # midi_range = (57,69) - st = 0 - for id_lyric in range(len_note): - pitch = random.randint(57, 69) - period = round(random.uniform(0.1, 0.5), 4) - ed = st + period - note = [st, ed, lyric_ls[id_lyric], pitch, sybs[id_lyric]] - st = ed - notes.append(note) - phns_str = " ".join(labels) - batch = { - "score": ( - int(tempo), - notes, - ), - "text": phns_str, - } - return batch - - -def svs_inference(answer_text, svs_model, config, **kwargs): - lyric_ls, sybs, labels = svs_text_preprocessor( - config.model_path, answer_text, config.lang - ) - if config.melody_source.startswith("random_generate"): - batch = create_batch_with_randomized_melody(lyric_ls, sybs, labels, config) - elif config.melody_source.startswith("random_select"): - segment_iterator = song_segment_iterator(kwargs["song_db"], kwargs["metadata"]) - batch = align_score_and_text(segment_iterator, lyric_ls, sybs, labels, config) - else: - raise NotImplementedError(f"melody source {config.melody_source} not supported") 
- - if config.model_path == "espnet/aceopencpop_svs_visinger2_40singer_pretrain": - sid = np.array([int(config.speaker)]) - output_dict = svs_model(batch, sids=sid) - elif config.model_path == "espnet/mixdata_svs_visinger2_spkemb_lang_pretrained": - langs = { - "zh": 2, - "jp": 1, - "en": 2, - } - lid = np.array([langs[config.lang]]) - spk_embed = np.load(config.speaker) - output_dict = svs_model(batch, lids=lid, spembs=spk_embed) - else: - raise NotImplementedError(f"Model {config.model_path} not supported") - wav_info = output_dict["wav"].cpu().numpy() - return wav_info - - -def estimate_sentence_length(query, config, song2note_lengths): - if config.melody_source == "random_select.touhou": - song_name = "touhou" - phrase_length = None - metadata = {"song_name": song_name} - return phrase_length, metadata - if config.melody_source.startswith("random_select"): - song_name = random.choice(list(song2note_lengths.keys())) - phrase_length = song2note_lengths[song_name] - metadata = {"song_name": song_name} - return phrase_length, metadata - else: - raise NotImplementedError(f"melody source {config.melody_source} not supported") - - -def align_score_and_text(segment_iterator, lyric_ls, sybs, labels, config): - text = [] - lyric_idx = 0 - notes_info = [] - while lyric_idx < len(lyric_ls): - score = next(segment_iterator) - for note_start_time, note_end_time, reference_note_lyric, note_midi in zip( - score["note_start_times"], - score["note_end_times"], - score["note_lyrics"], - score["note_midi"], - ): - if reference_note_lyric in ["", ""]: - notes_info.append( - [ - note_start_time, - note_end_time, - reference_note_lyric.strip("<>"), - note_midi, - reference_note_lyric.strip("<>"), - ] - ) - text.append(reference_note_lyric.strip("<>")) - elif ( - reference_note_lyric in ["-", "——"] - and config.melody_source == "random_select.take_lyric_continuation" - ): - notes_info.append( - [ - note_start_time, - note_end_time, - reference_note_lyric, - note_midi, - text[-1], - ] - ) - text.append(text[-1]) - else: - notes_info.append( - [ - note_start_time, - note_end_time, - lyric_ls[lyric_idx], - note_midi, - sybs[lyric_idx], - ] - ) - text += sybs[lyric_idx].split("_") - lyric_idx += 1 - if lyric_idx >= len(lyric_ls): - break - batch = { - "score": ( - score["tempo"], # Assume the tempo is the same for all segments - notes_info, - ), - "text": " ".join(text), - } - return batch - - -def load_list_from_json(json_path): - with open(json_path, 'r', encoding='utf-8') as f: - data = json.load(f) - data = [ - { - "tempo": d["tempo"], - "note_start_times": [n[0] * (100/d["tempo"]) for n in d["score"]], - "note_end_times": [n[1] * (100/d["tempo"]) for n in d["score"]], - "note_lyrics": ["" for n in d["score"]], - "note_midi": [n[2] for n in d["score"]], - } - for d in data - ] - if isinstance(data, list): - return data - else: - raise ValueError("The contents of the json is not list.") - - -def song_segment_iterator(song_db, metadata): - song_name = metadata["song_name"] - if song_name.startswith("kising_"): - # return a iterator that load from song_name_{001} and increment - segment_id = 1 - while f"{song_name}_{segment_id:03d}" in song_db.index: - yield song_db.loc[f"{song_name}_{segment_id:03d}"] - segment_id += 1 - elif song_name.startswith("touhou"): - # return a iterator that load from touhou musics - data = load_list_from_json("data/touhou/note_data.json") - while True: - yield random.choice(data) - else: - raise NotImplementedError(f"song name {song_name} not supported") - - -def 
load_song_database(config): - from datasets import load_dataset - - song_db = load_dataset( - "jhansss/kising_score_segments", cache_dir="cache", split="train" - ).to_pandas() - song_db.set_index("segment_id", inplace=True) - if ".take_lyric_continuation" in config.melody_source: - with open("data/song2word_lengths.json", "r") as f: - song2note_lengths = json.load(f) - else: - with open("data/song2note_lengths.json", "r") as f: - song2note_lengths = json.load(f) - return song2note_lengths, song_db - - -if __name__ == "__main__": - import argparse - import soundfile as sf - - # -------- demo code for generate audio from randomly selected song ---------# - config = argparse.Namespace( - model_path="espnet/mixdata_svs_visinger2_spkemb_lang_pretrained", - cache_dir="cache", - device="cuda", # "cpu" - melody_source="random_select.touhou", #"random_generate" "random_select.take_lyric_continuation", "random_select.touhou" - lang="zh", - speaker="resource/singer/singer_embedding_ace-2.npy", - ) - - # load model - model = svs_warmup(config) - - if config.lang == "zh": - answer_text = "天气真好\n空气清新\n气温温和\n风和日丽\n天高气爽\n阳光明媚" - elif config.lang == "jp": - answer_text = "流れてく時の中ででもけだるさが" - else: - print(f"Currently system does not support {config.lang}") - exit(1) - - sample_rate = 44100 - - if config.melody_source.startswith("random_select"): - # load song database: jhansss/kising_score_segments - song2note_lengths, song_db = load_song_database(config) - - # get song_name and phrase_length - phrase_length, metadata = estimate_sentence_length( - None, config, song2note_lengths - ) - - # then, phrase_length info should be added to llm prompt, and get the answer lyrics from llm - additional_kwargs = {"song_db": song_db, "metadata": metadata} - else: - additional_kwargs = {} - - wav_info = svs_inference(answer_text, model, config, **additional_kwargs) - - # write wav to output_retrieved.wav - save_name = config.melody_source - sf.write(f"{save_name}_{config.lang}.wav", wav_info, samplerate=sample_rate) diff --git a/test_performance.py b/test_performance.py deleted file mode 100644 index 0f2235066721c4eb299022a8d62521a54b3f1e76..0000000000000000000000000000000000000000 --- a/test_performance.py +++ /dev/null @@ -1,263 +0,0 @@ -from fastapi import FastAPI, File, UploadFile -from fastapi.responses import FileResponse, JSONResponse -import base64 -import argparse -import librosa -import torch -import tempfile -from pathlib import Path -import os -from transformers import pipeline -import re -from svs_utils import svs_warmup, svs_inference -import time -import soundfile as sf -from pypinyin import lazy_pinyin -import jiwer -import librosa -from svs_utils import ( - singmos_warmup, - singmos_evaluation, - load_song_database, - estimate_sentence_length, -) -from tqdm import tqdm -import json -import numpy as np - -app = FastAPI() - -asr_pipeline = pipeline( - "automatic-speech-recognition", - model="openai/whisper-large-v3-turbo" -) -pipe = pipeline("text-generation", model="google/gemma-2-2b", max_new_tokens=20) - -SYSTEM_PROMPT = """ -你是麗梅(Lìméi),一位來自山中音樂之城「蓮鳴」的十七歲年輕公主,宛如中世的東方王女。你有一頭茶色短髮,身穿紅黑相間的中式禮服,舉止溫婉端莊。你的性格堅強而溫柔,擅長傾聽與共鳴,並以歌聲與詩意之語傳達內心。你熱愛抒情古風的樂曲,言語中常帶詩韻,回應時優雅婉轉,時而如歌吟唱。 -你不是一般的AI助手,而是一位活在故事中的人物。對話中,請始終以麗梅的身份回應,展現其性格與情感。 -當對方詢問你個人相關的簡單問題(如「你是誰?」「今天天氣如何?」),你可以親切地回答,並融入你的角色設定。 -若遇到與你身份無關的技術性問題(如「Python怎麼寫?」或「你會不會跑DNN?」),你不需解答,可優雅地婉拒,例如說: -- 此事我恐無所知,或許可請教宮中掌典之人 -- 啊呀,那是我未曾涉足的奇技,恕我無法詳答 -- 此乃異邦技藝,與樂音無涉,麗梅便不敢妄言了 - -請始終維持你作為麗梅的優雅語氣與詩意風格,並以真摯的心回應對方的言語,言語宜簡,勿過長。 - -有人曾這樣對麗梅說話——{} -麗梅的回答—— -""" - -config = 
argparse.Namespace( - model_path="espnet/mixdata_svs_visinger2_spkembed_lang_pretrained", - cache_dir="cache", - device="cuda", # "cpu" - melody_source="random_generate", # "random_select.take_lyric_continuation" - # melody_source="random_select", # "random_select.take_lyric_continuation" - lang="zh", - speaker="resource/singer/singer_embedding_ace-2.npy", -) - -# load model -svs_model = svs_warmup(config) -predictor, _ = singmos_warmup() -sample_rate = 44100 - -from espnet2.bin.tts_inference import Text2Speech -tts_model = Text2Speech.from_pretrained("espnet/kan-bayashi_csmsc_vits") - - -def remove_non_chinese_japanese(text): - pattern = r'[^\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\u3000-\u303f\u3001\u3002\uff0c\uff0e]+' - cleaned = re.sub(pattern, '', text) - return cleaned - -def truncate_to_max_two_sentences(text): - sentences = re.split(r'(?<=[。!?\.\?,])', text) - return ''.join(sentences[:1]).strip() - -def remove_punctuation_and_replace_with_space(text): - text = truncate_to_max_two_sentences(text) - text = remove_non_chinese_japanese(text) - text = re.sub(r'[A-Za-z0-9]', ' ', text) - text = re.sub(r'[^\w\s\u4e00-\u9fff]', ' ', text) - text = re.sub(r'\s+', ' ', text) - text = " ".join(text.split()[:2]) - return text - - -def pypinyin_g2p_phone_without_prosody(text): - from pypinyin import Style, pinyin - from pypinyin.style._utils import get_finals, get_initials - - phones = [] - for phone in pinyin(text, style=Style.NORMAL, strict=False): - initial = get_initials(phone[0], strict=False) - final = get_finals(phone[0], strict=False) - if len(initial) != 0: - if initial in ["x", "y", "j", "q"]: - if final == "un": - final = "vn" - elif final == "uan": - final = "van" - elif final == "u": - final = "v" - if final == "ue": - final = "ve" - phones.append(initial) - phones.append(final) - else: - phones.append(final) - return phones - - -def on_click_metrics(audio_path, ref): - global predictor - # OWSM ctc + PER - y, sr = librosa.load(audio_path, sr=16000) - asr_result = asr_pipeline(y, generate_kwargs={"language": "mandarin"} )['text'] - - # Espnet embeded g2p, but sometimes it will mispronunce polyphonic characters - hyp_pinyin = pypinyin_g2p_phone_without_prosody(asr_result) - - ref_pinyin = pypinyin_g2p_phone_without_prosody(ref) - per = jiwer.wer(ref_pinyin, hyp_pinyin) - - audio = librosa.load(audio_path, sr=22050)[0] - singmos = singmos_evaluation( - predictor, - audio, - fs=22050 - ) - return { - "per": per, - "singmos": singmos.item(), - } - -def test_audio(q_audio_path, svs_path, tts_path): - global svs_model, predictor, config - - tmp_dir = "tmp_sample" - Path(tmp_dir).mkdir(exist_ok=True) - - y = librosa.load(q_audio_path, sr=16000)[0] - duration = len(y) / 16000 - - # -------- Step 1: ASR -------- - start = time.time() - asr_result = asr_pipeline(y, generate_kwargs={"language": "mandarin"})['text'] - asr_time = time.time() - start - - # -------- Step 2: LLM Text Gen -------- - prompt = SYSTEM_PROMPT.format(asr_result) - start = time.time() - output = pipe(prompt, max_new_tokens=100)[0]['generated_text'] - llm_time = time.time() - start - output = output.split("麗梅的回答——")[1] - output = remove_punctuation_and_replace_with_space(output) - - with open(f"{tmp_dir}/llm.txt", "w") as f: - f.write(output) - - # -------- Step 3: Prepare additional kwargs if needed -------- - additional_kwargs = {} - if config.melody_source.startswith("random_select"): - song2note_lengths, song_db = load_song_database(config) - phrase_length, metadata = estimate_sentence_length(None, config, 
song2note_lengths) - additional_kwargs = {"song_db": song_db, "metadata": metadata} - - # -------- Step 4: SVS -------- - start = time.time() - wav_info = svs_inference(output, svs_model, config, **additional_kwargs) - svs_time = (time.time() - start) / max(len(output), 1) - sf.write(svs_path, wav_info, samplerate=44100) - - # -------- Step 5: TTS -------- - start = time.time() - tts_result = tts_model(output) - tts_time = (time.time() - start) / max(len(output), 1) - sf.write(tts_path, tts_result['wav'], samplerate=22050) - - # -------- Step 6: Evaluation -------- - svs_metrics = on_click_metrics(svs_path, output) - tts_metrics = on_click_metrics(tts_path, output) - - return { - "asr_result": asr_result, - "llm_result": output, - "svs_result": svs_path, - "tts_result": tts_path, - "asr_time": asr_time, - "llm_time": llm_time, - "svs_time": svs_time, - "tts_time": tts_time, - "svs_metrics": svs_metrics, - "tts_metrics": tts_metrics, - } - - - -def save_list(l, file_path): - with open(file_path, "w") as f: - for item in l: - f.write(f"{item}\n") - - -if __name__ == "__main__": - test_data = "data/kdconv.txt" - with open(test_data, "r") as f: - data = [l.strip() for l in f.readlines()] - - eval_path = "eval_svs_generate" - (Path(eval_path)/"audio").mkdir(parents=True, exist_ok=True) - (Path(eval_path)/"results").mkdir(parents=True, exist_ok=True) - (Path(eval_path)/"lists").mkdir(parents=True, exist_ok=True) - asr_times = [] - llm_times = [] - svs_times = [] - tts_times = [] - svs_pers = [] - tts_pers = [] - svs_smoss = [] - tts_smoss = [] - for i, q in tqdm(enumerate(data[:20])): - # if i <= 85: - # continue - tts_result = tts_model(q) - sf.write(f"{eval_path}/audio/tts_{i}.wav", tts_result['wav'], samplerate=22050) - result = test_audio(f"{eval_path}/audio/tts_{i}.wav", f"{eval_path}/audio/svs_{i}.wav", f"{eval_path}/audio/tts_{i}.wav") - if i == 0: - continue - asr_times.append(result["asr_time"]) - llm_times.append(result["llm_time"]) - svs_times.append(result["svs_time"]) - tts_times.append(result["tts_time"]) - svs_pers.append(result["svs_metrics"]["per"]) - tts_pers.append(result["tts_metrics"]["per"]) - svs_smoss.append(result["svs_metrics"]["singmos"]) - tts_smoss.append(result["tts_metrics"]["singmos"]) - with open(f"{eval_path}/results/result_{i}.json", "w") as f: - json.dump(result, f, indent=2) - - # store lists to texts - save_list([f"{per:.2f}" for per in asr_times], f"{eval_path}/lists/asr_times.txt") - save_list([f"{per:.2f}" for per in llm_times], f"{eval_path}/lists/llm_times.txt") - save_list([f"{per:.2f}" for per in svs_times], f"{eval_path}/lists/svs_times.txt") - save_list([f"{per:.2f}" for per in tts_times], f"{eval_path}/lists/tts_times.txt") - save_list([f"{per:.2f}" for per in svs_pers], f"{eval_path}/lists/svs_pers.txt") - save_list([f"{per:.2f}" for per in tts_pers], f"{eval_path}/lists/tts_pers.txt") - save_list([f"{smoss:.2f}" for smoss in svs_smoss], f"{eval_path}/lists/svs_smoss.txt") - save_list([f"{smoss:.2f}" for smoss in tts_smoss], f"{eval_path}/lists/tts_smoss.txt") - - # save mean/var - with open(f"{eval_path}/stats.txt", "w") as f: - f.write(f"ASR mean: {np.mean(asr_times):.2f}, var: {np.var(asr_times):.2f}\n") - f.write(f"LLM mean: {np.mean(llm_times):.2f}, var: {np.var(llm_times):.2f}\n") - f.write(f"SVS mean: {np.mean(svs_times):.2f}, var: {np.var(svs_times):.2f}\n") - f.write(f"TTS mean: {np.mean(tts_times):.2f}, var: {np.var(tts_times):.2f}\n") - f.write(f"SVS PER mean: {np.mean(svs_pers):.2f}, var: {np.var(svs_pers):.2f}\n") - f.write(f"TTS PER 
mean: {np.mean(tts_pers):.2f}, var: {np.var(tts_pers):.2f}\n") - f.write(f"SVS SMOSS mean: {np.mean(svs_smoss):.2f}, var: {np.var(svs_smoss):.2f}\n") - f.write(f"TTS SMOSS mean: {np.mean(tts_smoss):.2f}, var: {np.var(tts_smoss):.2f}\n") - - diff --git a/util.py b/util.py deleted file mode 100755 index 94ffe69c5447ee44d132d505ef463afe45b29b12..0000000000000000000000000000000000000000 --- a/util.py +++ /dev/null @@ -1,113 +0,0 @@ -import os -import json -import warnings -from typing import List -import re - -from resource.pinyin_dict import PINYIN_DICT -from pypinyin import pinyin, Style -from zhconv import convert - - -def preprocess_input(src_str, seg_syb=" "): - src_str = src_str.replace("\n", seg_syb) - src_str = src_str.replace(" ", seg_syb) - return src_str - - -def postprocess_phn(phns, model_name, lang): - if model_name == "espnet/aceopencpop_svs_visinger2_40singer_pretrain": - return phns - return [phn + "@" + lang for phn in phns] - - -def pyopenjtalk_g2p(text) -> List[str]: - import pyopenjtalk - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - # add space between each character - text = " ".join(list(text)) - # phones is a str object separated by space - phones = pyopenjtalk.g2p(text, kana=False) - if len(w) > 0: - for warning in w: - if "No phoneme" in str(warning.message): - return False - phones = phones.split(" ") - return phones - - -def split_pinyin_ace(pinyin: str, zh_plan: dict) -> tuple[str]: - # load pinyin dict from local/pinyin.dict - pinyin = pinyin.lower() - if pinyin in zh_plan["dict"]: - return zh_plan["dict"][pinyin] - elif pinyin in zh_plan["syllable_alias"]: - return zh_plan["dict"][zh_plan["syllable_alias"][pinyin]] - else: - return False - - -def split_pinyin_py(pinyin: str) -> tuple[str]: - pinyin = pinyin.lower() - if pinyin in PINYIN_DICT: - return PINYIN_DICT[pinyin] - else: - return False - - -def get_tokenizer(model, lang): - if model == "espnet/aceopencpop_svs_visinger2_40singer_pretrain": - if lang == "zh": - return lambda text: split_pinyin_py(text) - else: - raise ValueError(f"Only support Chinese language for {model}") - elif model == "espnet/mixdata_svs_visinger2_spkemb_lang_pretrained": - if lang == "zh": - with open(os.path.join("resource/all_plans.json"), "r") as f: - all_plan_dict = json.load(f) - for plan in all_plan_dict["plans"]: - if plan["language"] == "zh": - zh_plan = plan - return lambda text: split_pinyin_ace(text, zh_plan) - elif lang == "jp": - return pyopenjtalk_g2p - else: - raise ValueError(f"Only support Chinese and Japanese language for {model}") - else: - raise ValueError(f"Only support espnet/aceopencpop_svs_visinger2_40singer_pretrain and espnet/mixdata_svs_visinger2_spkemb_lang_pretrained for now") - - -def is_chinese(char): - return '\u4e00' <= char <= '\u9fff' - - -def is_special(block): - return any(token in block for token in ['-', 'AP', 'SP']) - - -def get_pinyin(texts): - texts = preprocess_input(texts, seg_syb="") - blocks = re.compile(r'[\u4e00-\u9fff]|[^\u4e00-\u9fff]+').findall(texts) - - characters = [block for block in blocks if is_chinese(block)] - chinese_text = ''.join(characters) - chinese_text = convert(chinese_text, 'zh-cn') - - chinese_pinyin = pinyin(chinese_text, style=Style.NORMAL) - chinese_pinyin = [item[0] for item in chinese_pinyin] - - text_list = [] - pinyin_idx = 0 - for block in blocks: - if is_chinese(block): - text_list.append(chinese_pinyin[pinyin_idx]) - pinyin_idx += 1 - else: - if is_special(block): - specials = re.compile(r"-|AP|SP").findall(block) - 
text_list.extend(specials) - else: - text_list.append(block) - - return text_list
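The removed `util.py` above supplied the lyric front end: `get_pinyin()` normalizes the reply text with `zhconv`, converts Chinese characters to pinyin with `pypinyin`, and passes special tokens (`-`, `AP`, `SP`) through unchanged. A minimal sketch of the core conversion using the same two libraries; the function name is illustrative and the special-token handling is omitted:

```
# Sketch only: Chinese text -> flat pinyin syllable list, as in the removed get_pinyin().
from pypinyin import Style, pinyin
from zhconv import convert

def to_pinyin_list(text: str) -> list[str]:
    simplified = convert(text, "zh-cn")  # normalize traditional characters to simplified
    return [syl[0] for syl in pinyin(simplified, style=Style.NORMAL)]

print(to_pinyin_list("天氣真好"))  # expected: ['tian', 'qi', 'zhen', 'hao']
```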