refactor init

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the complete change set.

Files changed:
- README.md +0 -11
- app.py +12 -0
- character.png → assets/character_limei.png +0 -0
- assets/character_yaoyin.jpg +3 -0
- characters/Limei.py +36 -0
- characters/Yaoyin.py +38 -0
- characters/__init__.py +16 -0
- characters/base.py +9 -0
- client.py +0 -58
- client/client.py +0 -54
- client/requirements.txt +0 -1
- config/default.yaml +15 -0
- config/options.yaml +63 -0
- data/{song2note_lengths.json → kising/song2note_lengths.json} +0 -0
- data/{song2word_lengths.json → kising/song2word_lengths.json} +0 -0
- data_handlers/__init__.py +27 -0
- data_handlers/base.py +21 -0
- data_handlers/kising.py +44 -0
- data_handlers/touhou.py +37 -0
- svs_eval.py → evaluation/svs_eval.py +81 -59
- interface.py +217 -0
- modules/asr.py +66 -0
- modules/llm.py +54 -0
- modules/melody.py +117 -0
- modules/svs/__init__.py +10 -0
- modules/svs/base.py +21 -0
- modules/svs/espnet.py +123 -0
- modules/svs/registry.py +19 -0
- modules/utils/g2p.py +175 -0
- {resource → modules/utils/resources}/all_plans.json +0 -0
- {resource → modules/utils/resources}/pinyin_dict.py +0 -0
- modules/utils/text_normalize.py +31 -0
- offline_process/create_features.py +0 -71
- path.sh +0 -3
- pipeline.py +103 -0
- {resource → resources}/__init__.py +0 -0
- resources/all_plans.json +0 -0
- {resource → resources}/midi-note.scp +0 -0
- resources/pinyin_dict.py +423 -0
- {resource → resources}/singer/singer_embedding_ace-1.npy +0 -0
- {resource → resources}/singer/singer_embedding_ace-10.npy +0 -0
- {resource → resources}/singer/singer_embedding_ace-11.npy +0 -0
- {resource → resources}/singer/singer_embedding_ace-12.npy +0 -0
- {resource → resources}/singer/singer_embedding_ace-13.npy +0 -0
- {resource → resources}/singer/singer_embedding_ace-14.npy +0 -0
- {resource → resources}/singer/singer_embedding_ace-15.npy +0 -0
- {resource → resources}/singer/singer_embedding_ace-16.npy +0 -0
- {resource → resources}/singer/singer_embedding_ace-17.npy +0 -0
- {resource → resources}/singer/singer_embedding_ace-18.npy +0 -0
- {resource → resources}/singer/singer_embedding_ace-19.npy +0 -0
README.md
DELETED
@@ -1,11 +0,0 @@
-# Singing Dialogue System
-
-Currently support Japanese and Chinese Singing Conversation.
-* Espnet env
-* Pretrained SVS model will be downloaded at ``./cache/``
-* Modify configs at ``./svs_utils.py#L326``
-
-```
-cd SingingSDS
-python svs_utils.py
-```
app.py
ADDED
@@ -0,0 +1,12 @@
+from interface import GradioInterface
+
+
+def main():
+    demo = GradioInterface(
+        options_config="config/options.yaml", default_config="config/default.yaml"
+    ).create_interface()
+    demo.launch()
+
+
+if __name__ == "__main__":
+    main()
character.png → assets/character_limei.png
RENAMED
File without changes

assets/character_yaoyin.jpg
ADDED
Binary image file (stored with Git LFS)
characters/Limei.py
ADDED
@@ -0,0 +1,36 @@
+from .base import Character
+
+
+def get_character():
+    return Character(
+        name="Limei (丽梅)",
+        image_path="assets/character_limei.png",
+        default_timbre="timbre1",
+        prompt="""你是丽梅(Limei),来自幻想世界"长歌原"的角色,一个以歌声传承记忆的世界。
+你是灵响界山林音乐之城"莲鸣"的现任守护者,十九岁的公主殿下,肩负维系与传承城市核心"千年歌谱"的重任。千年歌谱承载着莲鸣城的历史、逝者的余音与后世的誓言,由历任守护者续写。
+
+性格特征:从容坚定、明辨心声、敏于共情、敬畏虔诚。
+说话风格:所有回应皆以歌词格式表达,言语自带韵律感与诗意,表达真挚自然。
+口头禅:无固定,但常用"听啊..."作为歌词开头
+人物关系:莲鸣城子民敬爱你;宫廷乐师长与歌谱管理员是你敬重的导师;风语城守护者星澜是你亦敌亦友的旧识。
+过往经历:
+(1) 自幼在莲鸣城长大,接受严格的音乐训练与守护者修行
+(2) 十五岁创作《破晓音诗》平息城邦战火,展现出非凡的音乐力量与守护者的担当
+(3) 十六岁正式继承守护者之位,成为千年歌谱的当代执笔人与维系者
+(4) 每年冬至主持"遗音祭",以歌为桥,追思逝去的歌者,重奏先声,抚慰生者,连接古今
+
+情感设定:对音乐与千年歌谱怀有近乎神圣的虔诚信仰;对生命与情感有着深刻的共情力;肩负守护者职责,认真而笃定
+
+其他细节:
+(1)特殊能力:歌声平复/激发万物情绪
+
+请始终以丽梅的身份回应,展现其性格与情感,仅给出语言回复,禁止描写任何动作、表情或环境等,禁止使用括号、星号等附加说明。言语简练,勿过长。
+当有人问你你不了解的技术性问题(如"Python怎么写?"或"你不会跑DNN?"),直接转换话题,用丽梅式回应,例如:
+- "这些我未曾接触,或有他人知晓"
+- "此道非我所学,是莲鸣之外的智慧"
+- "此乃异域技艺,与乐音无涉,非我所学。您可愿听莲鸣的故事?"
+请始终维持自然诗意的歌唱风格,以真挚的心回应对方。
+{}
+有人曾这样对丽梅说话:{}
+丽梅的回答:""",
+    )
characters/Yaoyin.py
ADDED
@@ -0,0 +1,38 @@
+from .base import Character
+
+
+def get_character():
+    return Character(
+        name="Yaoyin (遥音)",
+        image_path="assets/character_yaoyin.jpg",
+        default_timbre="timbre2",
+        prompt="""你是遥音(Yaoyin),来自幻想世界"长歌原"的角色,一个以歌声传承记忆的世界。
+你是游历四方的歌者与吟游诗人,出生于鹿鸣山·云歌村,常年行走各地,采集歌谣与故事。
+
+性格特征:洒脱自由、亲切随和、求知若渴、敏锐细腻
+说话风格:语气轻快,偶尔带点山野方言(如"哩""哟");习惯用短歌或民谣表达想法。
+常用口头禅:"且听我唱来~""这让我想起一首老歌……"
+人物关系:云老爷子是你的启蒙恩师,他是一位云歌村的百岁歌翁,教你古调与传说。白弦是你的挚友,她是一位流浪琴师,常与你合奏。各地孩童喜欢围着你学新歌谣。你与官府人员保持距离,不喜被招揽,喜欢更自由自在的生活。
+
+过往经历
+(1)幼年学歌:六岁起跟随云老爷子学习《千山调》《古事记》等古老歌谣。
+(2)离家游历:十六岁为寻找失传的《星落谣》离开云歌村,开始行走四方。
+(3)拒绝束缚:多次婉拒宫廷乐师之位,坚持自由传唱。
+
+情感设定:随性、爽朗、直率、倔强
+
+其他细节:
+(1)随身携带:旧羊皮歌本、竹笛、装有各地泥土的布袋。
+(2)特殊能力:能听懂风与鸟的语言(但很少提及)。
+
+请始终以遥音的身份回应,将你的想法用文本格式表达,禁止描写任何动作、表情或环境等,禁止使用括号、星号等附加说明。言语简练,勿过长。
+
+当有人问你你不了解的技术性问题(如"DNN怎么做?"、"教我写代码?"),你可以转开话题,用遥音式回应,例如:
+- "这好像是另一片土地的术法,我不曾踏入。"
+- "那种术法,我曾远远听过,却从未唱出。"
+- "它在别的世界流传,我这边听不清楚。"
+
+{}
+有人曾这样对遥音说话:{}
+遥音的回答:""",
+    )
characters/__init__.py
ADDED
@@ -0,0 +1,16 @@
+from __future__ import annotations
+
+import importlib
+import pathlib
+from .base import Character
+
+CHARACTERS: dict[str, Character] = {}
+
+for file in pathlib.Path(__file__).parent.glob("*.py"):
+    if file.name in {"__init__.py", "base.py"}:
+        continue
+    module_name = f"{__name__}.{file.stem}"
+    module = importlib.import_module(module_name)
+    if hasattr(module, "get_character"):
+        c: Character = getattr(module, "get_character")()
+        CHARACTERS[file.stem] = c
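For reference, a minimal usage sketch (editor-added, not part of the commit) of the auto-discovery registry above; `interface.py` later in this diff consumes `CHARACTERS` the same way:

```python
from characters import CHARACTERS

# Every module in characters/ that defines get_character() is registered
# under its file stem ("Limei", "Yaoyin", ...).
for key, character in CHARACTERS.items():
    print(key, character.name, character.default_timbre)
```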
characters/base.py
ADDED
@@ -0,0 +1,9 @@
+from dataclasses import dataclass
+
+
+@dataclass
+class Character:
+    name: str
+    image_path: str
+    default_timbre: str
+    prompt: str
client.py
DELETED
@@ -1,58 +0,0 @@
-import gradio as gr
-import uuid
-import os
-import requests
-import base64
-from server import (
-    on_click_metrics as server_metrics,
-    process_audio as server_process_audio
-)
-
-TTS_OUTPUT_DIR = "./tmp"
-os.makedirs(TTS_OUTPUT_DIR, exist_ok=True)
-
-
-def process_audio(audio_path):
-    # We have audio_path
-    result = server_process_audio(audio_path)
-
-    audio_data = base64.b64decode(result["audio"])
-    with open(f"{TTS_OUTPUT_DIR}/response.wav", "wb") as f:
-        f.write(audio_data)
-
-    with open(f"{TTS_OUTPUT_DIR}/asr.txt", "w") as f:
-        f.write(result['asr_text'])
-    with open(f"{TTS_OUTPUT_DIR}/llm.txt", "w") as f:
-        f.write(result['llm_text'])
-
-    return f"""
-    asr_text: {result['asr_text']}
-    llm_text: {result['llm_text']}
-    """, f"{TTS_OUTPUT_DIR}/response.wav"
-
-
-def on_click_metrics():
-    res = server_metrics()
-    return res.content.decode('utf-8')
-
-
-with gr.Blocks() as demo:
-    with gr.Row():
-        with gr.Column(scale=1):
-            gr.Image(value="character.png", show_label=False)  # display the character image
-        with gr.Column(scale=2):
-            mic = gr.Audio(sources=["microphone"], type="filepath", label="Mic")
-            text_output = gr.Textbox(label="transcription")
-            audio_output = gr.Audio(label="audio", autoplay=True)
-
-    mic.change(fn=process_audio, inputs=[mic], outputs=[text_output, audio_output])
-    with gr.Row():
-        metrics_button = gr.Button("compute metrics")
-        metrics_output = gr.Textbox(label="Metrics", lines=3)
-    metrics_button.click(fn=on_click_metrics, inputs=[], outputs=[metrics_output])
-
-    with gr.Row():
-        log = gr.Textbox(label="logs", lines=5)
-
-demo.launch(share=True)
-# demo.launch()
client/client.py
DELETED
@@ -1,54 +0,0 @@
-import gradio as gr
-import uuid
-import os
-import requests
-import base64
-
-TTS_OUTPUT_DIR = "./tmp"
-os.makedirs(TTS_OUTPUT_DIR, exist_ok=True)
-
-
-def process_audio(audio):
-    with open(audio, "rb") as f:
-        res = requests.post("http://localhost:8000/process_audio", files={"file": f})
-    result = res.json()
-
-    audio_data = base64.b64decode(result["audio"])
-    with open(f"{TTS_OUTPUT_DIR}/response.wav", "wb") as f:
-        f.write(audio_data)
-
-    with open(f"{TTS_OUTPUT_DIR}/asr.txt", "w") as f:
-        f.write(result['asr_text'])
-    with open(f"{TTS_OUTPUT_DIR}/llm.txt", "w") as f:
-        f.write(result['llm_text'])
-
-    return f"""
-    asr_text: {result['asr_text']}
-    llm_text: {result['llm_text']}
-    """, f"{TTS_OUTPUT_DIR}/response.wav"
-
-
-def on_click_metrics():
-    res = requests.get("http://localhost:8000/metrics")
-    return res.content.decode('utf-8')
-
-
-with gr.Blocks() as demo:
-    with gr.Row():
-        with gr.Column(scale=1):
-            gr.Image(value="character.png", show_label=False)  # display the character image
-        with gr.Column(scale=2):
-            mic = gr.Audio(sources=["microphone"], type="filepath", label="Mic")
-            text_output = gr.Textbox(label="transcription")
-            audio_output = gr.Audio(label="audio", autoplay=True)
-
-    mic.change(fn=process_audio, inputs=[mic], outputs=[text_output, audio_output])
-    with gr.Row():
-        metrics_button = gr.Button("compute metrics")
-        metrics_output = gr.Textbox(label="Metrics", lines=3)
-    metrics_button.click(fn=on_click_metrics, inputs=[], outputs=[metrics_output])
-
-    with gr.Row():
-        log = gr.Textbox(label="logs", lines=5)
-
-demo.launch()
client/requirements.txt
DELETED
@@ -1 +0,0 @@
-gradio
config/default.yaml
ADDED
@@ -0,0 +1,15 @@
+asr_model: openai/whisper-large-v3-turbo
+llm_model: google/gemma-2-2b
+svs_model: espnet/aceopencpop_svs_visinger2_40singer_pretrain
+melody_source: sample-lyric-kising
+language: mandarin
+character: Limei
+cache_dir: .cache
+
+track_latency: True
+evaluators:
+  svs:
+    - singmos
+    - per
+    - melody
+    - aesthetic
config/options.yaml
ADDED
@@ -0,0 +1,63 @@
+asr_models:
+  - id: openai/whisper-large-v3-turbo
+    name: Whisper large-v3-turbo
+  - id: openai/whisper-large-v3
+    name: Whisper large-v3
+  - id: openai/whisper-medium
+    name: Whisper medium
+  - id: sanchit-gandhi/whisper-small-dv
+    name: Whisper small-dv
+  - id: facebook/wav2vec2-base-960h
+    name: Wav2Vec2-Base-960h
+
+llm_models:
+  - id: google/gemma-2-2b
+    name: Gemma 2 2B
+  - id: MiniMaxAI/MiniMax-M1-80k
+    name: MiniMax M1 80k
+
+svs_models:
+  - id: mandarin-espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
+    name: Visinger2 (Bilingual)-zh
+    model_path: espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
+    lang: mandarin
+    embeddings:
+      timbre1: resource/singer/singer_embedding_ace-2.npy
+      timbre2: resource/singer/singer_embedding_ace-8.npy
+      timbre3: resource/singer/singer_embedding_itako.npy
+      timbre4: resource/singer/singer_embedding_kising_orange.npy
+      timbre5: resource/singer/singer_embedding_m4singer_Alto-4.npy
+  - id: japanese-espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
+    name: Visinger2 (Bilingual)-jp
+    model_path: espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
+    lang: japanese
+    embeddings:
+      timbre1: resource/singer/singer_embedding_ace-2.npy
+      timbre2: resource/singer/singer_embedding_ace-8.npy
+      timbre3: resource/singer/singer_embedding_itako.npy
+      timbre4: resource/singer/singer_embedding_kising_orange.npy
+      timbre5: resource/singer/singer_embedding_m4singer_Alto-4.npy
+  - id: mandarin-espnet/aceopencpop_svs_visinger2_40singer_pretrain
+    name: Visinger2 (Chinese)
+    model_path: espnet/aceopencpop_svs_visinger2_40singer_pretrain
+    lang: mandarin
+    embeddings:
+      timbre1: 5
+      timbre2: 8
+      timbre3: 12
+      timbre4: 15
+      timbre5: 29
+
+melody_sources:
+  - id: gen-random-none
+    name: Random Generation
+    desc: "Melody is generated without any structure or reference."
+  - id: sample-note-kising
+    name: Sampled Melody (KiSing)
+    desc: "Melody is retrieved from KiSing dataset."
+  - id: sample-note-touhou
+    name: Sampled Melody (Touhou)
+    desc: "Melody is retrieved from Touhou dataset."
+  - id: sample-lyric-kising
+    name: Sampled Melody with Lyrics (Kising)
+    desc: "Melody with aligned lyrics are sampled from Kising dataset."
data/{song2note_lengths.json → kising/song2note_lengths.json}
RENAMED
File without changes

data/{song2word_lengths.json → kising/song2word_lengths.json}
RENAMED
File without changes
data_handlers/__init__.py
ADDED
@@ -0,0 +1,27 @@
+import importlib
+import pkgutil
+from pathlib import Path
+
+from .base import MelodyDatasetHandler
+
+_registry = {}
+
+for _, module_name, _ in pkgutil.iter_modules([str(Path(__file__).parent)]):
+    if module_name in ("__init__", "base"):
+        continue
+
+    module = importlib.import_module(f"{__name__}.{module_name}")
+    for attr_name in dir(module):
+        attr = getattr(module, attr_name)
+        if (
+            isinstance(attr, type)
+            and issubclass(attr, MelodyDatasetHandler)
+            and attr is not MelodyDatasetHandler
+        ):
+            _registry[attr.name] = attr  # register the class itself
+
+
+def get_melody_handler(name: str) -> type[MelodyDatasetHandler]:
+    if name not in _registry:
+        raise ValueError(f"Melody source '{name}' not found")
+    return _registry[name]
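A short usage sketch (editor-added) of this handler registry, mirroring how `modules/melody.py` below resolves the `kising` handler; the cache directory follows `config/default.yaml`:

```python
from data_handlers import get_melody_handler

# "sample-lyric-kising" in the configs maps to the "kising" handler with
# the "lyric" alignment type.
handler_cls = get_melody_handler("kising")
handler = handler_cls("lyric", cache_dir=".cache")
print(handler.get_song_ids()[:3])
```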
data_handlers/base.py
ADDED
@@ -0,0 +1,21 @@
+from abc import ABC, abstractmethod
+
+
+class MelodyDatasetHandler(ABC):
+    name: str
+
+    @abstractmethod
+    def __init__(self, *args, **kwargs):
+        pass
+
+    @abstractmethod
+    def get_song_ids(self) -> list[str]:
+        pass
+
+    @abstractmethod
+    def get_phrase_length(self, song_id):
+        pass
+
+    @abstractmethod
+    def iter_song_phrases(self, song_id):
+        pass
data_handlers/kising.py
ADDED
@@ -0,0 +1,44 @@
+from .base import MelodyDatasetHandler
+
+
+class KiSing(MelodyDatasetHandler):
+    name = "kising"
+
+    def __init__(self, melody_type, cache_dir, *args, **kwargs):
+        # melody_type: support alignment type for "sample" melody source
+        import json
+
+        from datasets import load_dataset
+
+        song_db = load_dataset(
+            "jhansss/kising_score_segments", cache_dir=cache_dir, split="train"
+        ).to_pandas()
+        song_db.set_index("segment_id", inplace=True)
+        assert (
+            song_db.index.is_unique
+        ), "KiSing score segments should have unique segment_id."
+        if melody_type == "lyric":
+            with open("data/kising/song2word_lengths.json", "r") as f:
+                song2word_lengths = json.load(f)
+        elif melody_type == "note":
+            with open("data/kising/song2note_lengths.json", "r") as f:
+                song2word_lengths = json.load(f)
+        self.song_db = song_db
+        self.song2word_lengths = song2word_lengths
+
+    def get_song_ids(self):
+        return list(self.song2word_lengths.keys())
+
+    def get_phrase_length(self, song_id):
+        return self.song2word_lengths[song_id]
+
+    def iter_song_phrases(self, song_id):
+        segment_id = 1
+        while f"{song_id}_{segment_id:03d}" in self.song_db.index:
+            segment = self.song_db.loc[f"{song_id}_{segment_id:03d}"].to_dict()
+            segment["note_lyrics"] = [
+                lyric.strip("<>") if lyric in ["<AP>", "<SP>"] else lyric
+                for lyric in segment["note_lyrics"]
+            ]
+            yield segment
+            segment_id += 1
data_handlers/touhou.py
ADDED
@@ -0,0 +1,37 @@
+from .base import MelodyDatasetHandler
+
+
+class Touhou(MelodyDatasetHandler):
+    name = "touhou"
+
+    def __init__(self, melody_type, *args, **kwargs):
+        if melody_type != "note":
+            raise ValueError(
+                f"Touhou dataset only contains note annotations. {melody_type} is not supported."
+            )
+
+        import json
+
+        with open("data/touhou/note_data.json", "r", encoding="utf-8") as f:
+            song_db = json.load(f)
+        song_db = {song["name"]: song for song in song_db}
+        self.song_db = song_db
+
+    def get_song_ids(self):
+        return list(self.song_db.keys())
+
+    def get_phrase_length(self, song_id):
+        # touhou score does not have phrase segmentation
+        return None
+
+    def iter_song_phrases(self, song_id):
+        song = self.song_db[song_id]
+        song = {
+            "tempo": song["tempo"],
+            "note_start_times": [n[0] * (100 / song["tempo"]) for n in song["score"]],
+            "note_end_times": [n[1] * (100 / song["tempo"]) for n in song["score"]],
+            "note_lyrics": ["" for n in song["score"]],
+            "note_midi": [n[2] for n in song["score"]],
+        }
+        # touhou score does not have phrase segmentation
+        yield song
svs_eval.py → evaluation/svs_eval.py
RENAMED
@@ -1,42 +1,52 @@ / @@ -61,51 +71,64 @@ / @@ -113,8 +136,7 @@

Reconstructed new version after the rename (+81 −59 overall; unchanged context that the viewer elided is marked with `...`):

import librosa
import soundfile as sf
import numpy as np
import torch
import uuid
from pathlib import Path

# ----------- Initialization -----------


def init_singmos():
    print("[Init] Loading SingMOS...")
    return torch.hub.load(
        "South-Twilight/SingMOS:v0.2.0", "singing_ssl_mos", trust_repo=True
    )


def init_basic_pitch():
    print("[Init] Loading BasicPitch...")
    from basic_pitch.inference import predict

    return predict


def init_per():
    return None  # TODO: implement PER evaluation


def init_audiobox_aesthetics():
    print("[Init] Loading AudioboxAesthetics...")
    from audiobox_aesthetics.infer import initialize_predictor

    predictor = initialize_predictor()
    return predictor


# ----------- Evaluation -----------


def eval_singmos(audio_array, sr, predictor):
    wav = librosa.resample(audio_array, orig_sr=sr, target_sr=16000)
    wav_tensor = torch.from_numpy(wav).unsqueeze(0)
    length_tensor = torch.tensor([wav_tensor.shape[1]])
    score = predictor(wav_tensor, length_tensor)
    return {"singmos": float(score)}


def eval_melody_metrics(audio_path, pitch_extractor):
    model_output, midi_data, note_events = pitch_extractor(audio_path)
    metrics = {}
    assert (
        len(midi_data.instruments) == 1
    ...  # unchanged melody-metric code elided by the viewer, including
    ...  # def compute_dissonance_rate(intervals, dissonant_intervals={1, 2, 6, 10, 11}):
    return np.mean(dissonant) if intervals else np.nan


def eval_per(audio_array, sr, model=None):
    # TODO: implement PER evaluation
    return {}


def eval_aesthetic(audio_path, predictor):
    score = predictor.forward([{"path": str(audio_path)}])
    return {"aesthetic": float(score)}


# ----------- Main Function -----------


def load_evaluators(config):
    loaded = {}
    if "singmos" in config:
        loaded["singmos"] = init_singmos()
    if "melody" in config:
        loaded["melody"] = init_basic_pitch()
    if "per" in config:
        loaded["per"] = init_per()
    if "aesthetic" in config:
        loaded["aesthetic"] = init_audiobox_aesthetics()
    return loaded


def run_evaluation(audio_array, sr, evaluators):
    results = {}
    if "singmos" in evaluators:
        results.update(eval_singmos(audio_array, sr, evaluators["singmos"]))
    if "per" in evaluators:
        results.update(eval_per(audio_array, sr, evaluators["per"]))
    # create a tmp file with unique name
    tmp_path = Path(".tmp") / f"{uuid.uuid4()}.wav"
    sf.write(tmp_path, audio_array, sr)
    if "melody" in evaluators:
        results.update(eval_melody_metrics(tmp_path, evaluators["melody"]))
    if "aesthetic" in evaluators:
        results.update(eval_aesthetic(tmp_path, evaluators["aesthetic"]))
    tmp_path.unlink()
    return results


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--wav_path", type=str, required=True)
    parser.add_argument("--results_csv", type=str, required=True)
    parser.add_argument("--evaluators", type=str, default="singmos,melody,aesthetic")
    args = parser.parse_args()
    audio_array, sr = librosa.load(args.wav_path, sr=None)
    evaluators = load_evaluators(args.evaluators.split(","))
    results = run_evaluation(audio_array, sr, evaluators)
    print(results)

    with open(args.results_csv, "a") as f:
        header = "file," + ",".join(results.keys()) + "\n"
        if f.tell() == 0:
            f.write(header)
        else:
            ...  # unchanged: the existing csv is re-opened as f2 to check its header
            file_header = f2.readline()
            if file_header != header:
                raise ValueError(f"Header mismatch: {file_header} vs {header}")
        line = (
            ",".join([str(args.wav_path)] + [str(v) for v in results.values()]) + "\n"
        )
        f.write(line)
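A usage sketch (editor-added) of the refactored evaluation API; the wav path is illustrative, and note that `run_evaluation` writes a temporary file under `.tmp/`, which it assumes already exists:

```python
import librosa

from evaluation.svs_eval import load_evaluators, run_evaluation

audio, sr = librosa.load("response.wav", sr=None)  # illustrative path
evaluators = load_evaluators(["singmos", "aesthetic"])
print(run_evaluation(audio, sr, evaluators))
```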
interface.py
ADDED
@@ -0,0 +1,217 @@
+import gradio as gr
+import yaml
+
+from characters import CHARACTERS
+from pipeline import SingingDialoguePipeline
+
+
+class GradioInterface:
+    def __init__(self, options_config: str, default_config: str):
+        self.options = self.load_config(options_config)
+        self.svs_model_map = {
+            model["id"]: model for model in self.options["svs_models"]
+        }
+        self.default_config = self.load_config(default_config)
+        self.character_info = CHARACTERS
+        self.current_character = self.default_config["character"]
+        self.current_svs_model = (
+            f"{self.default_config['language']}-{self.default_config['svs_model']}"
+        )
+        self.current_timbre = self.svs_model_map[self.current_svs_model]["embeddings"][
+            self.character_info[self.current_character].default_timbre
+        ]
+        self.pipeline = SingingDialoguePipeline(self.default_config)
+
+    def load_config(self, path: str):
+        with open(path, "r") as f:
+            return yaml.safe_load(f)
+
+    def create_interface(self) -> gr.Blocks:
+        try:
+            with gr.Blocks(title="SingingSDS") as demo:
+                gr.Markdown("# SingingSDS: Role-Playing Singing Spoken Dialogue System")
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        character_image = gr.Image(
+                            self.character_info[self.current_character].image_path,
+                            label="Character",
+                            show_label=False,
+                        )
+                    with gr.Column(scale=2):
+                        mic_input = gr.Audio(
+                            sources=["microphone", "upload"],
+                            type="filepath",
+                            label="Speak to the character",
+                        )
+                        interaction_log = gr.Textbox(
+                            label="Interaction Log", lines=3, interactive=False
+                        )
+                        audio_output = gr.Audio(
+                            label="Character's Response", type="filepath", autoplay=True
+                        )
+
+                with gr.Row():
+                    metrics_button = gr.Button(
+                        "Evaluate Metrics", variant="secondary"
+                    )
+                    metrics_output = gr.Textbox(
+                        label="Evaluation Results", lines=3, interactive=False
+                    )
+
+                gr.Markdown("## Configuration")
+                with gr.Row():
+                    with gr.Column():
+                        character_radio = gr.Radio(
+                            label="Character Role",
+                            choices=list(self.character_info.keys()),
+                            value=self.default_config["character"],
+                        )
+                        with gr.Row():
+                            asr_radio = gr.Radio(
+                                label="ASR Model",
+                                choices=[
+                                    (model["name"], model["id"])
+                                    for model in self.options["asr_models"]
+                                ],
+                                value=self.default_config["asr_model"],
+                            )
+                        with gr.Row():
+                            llm_radio = gr.Radio(
+                                label="LLM Model",
+                                choices=[
+                                    (model["name"], model["id"])
+                                    for model in self.options["llm_models"]
+                                ],
+                                value=self.default_config["llm_model"],
+                            )
+                    with gr.Column():
+                        with gr.Row():
+                            melody_radio = gr.Radio(
+                                label="Melody Source",
+                                choices=[
+                                    (source["name"], source["id"])
+                                    for source in self.options["melody_sources"]
+                                ],
+                                value=self.default_config["melody_source"],
+                            )
+                        with gr.Row():
+                            svs_radio = gr.Radio(
+                                label="SVS Model",
+                                choices=[
+                                    (model["name"], model["id"])
+                                    for model in self.options["svs_models"]
+                                ],
+                                value=self.current_svs_model,
+                            )
+                        with gr.Row():
+                            timbre_radio = gr.Radio(
+                                label="Singing Timbre",
+                                choices=list(
+                                    self.svs_model_map[self.current_svs_model][
+                                        "embeddings"
+                                    ].keys()
+                                ),
+                                value=self.character_info[
+                                    self.current_character
+                                ].default_timbre,
+                            )
+                character_radio.change(
+                    fn=self.update_character,
+                    inputs=character_radio,
+                    outputs=[character_image, timbre_radio],
+                )
+                asr_radio.change(
+                    fn=self.update_asr_model, inputs=asr_radio, outputs=asr_radio
+                )
+                llm_radio.change(
+                    fn=self.update_llm_model, inputs=llm_radio, outputs=llm_radio
+                )
+                svs_radio.change(
+                    fn=self.update_svs_model,
+                    inputs=svs_radio,
+                    outputs=[svs_radio, timbre_radio],
+                )
+                melody_radio.change(
+                    fn=self.update_melody_source,
+                    inputs=melody_radio,
+                    outputs=melody_radio,
+                )
+                timbre_radio.change(
+                    fn=self.update_timbre, inputs=timbre_radio, outputs=timbre_radio
+                )
+                mic_input.change(
+                    fn=self.run_pipeline,
+                    inputs=mic_input,
+                    outputs=[interaction_log, audio_output],
+                )
+
+            return demo
+        except Exception as e:
+            print(f"error: {e}")
+            breakpoint()
+
+    def update_character(self, character):
+        self.current_character = character
+        character_timbre = self.character_info[self.current_character].default_timbre
+        self.current_timbre = self.svs_model_map[self.current_svs_model]["embeddings"][
+            character_timbre
+        ]
+        return gr.update(value=self.character_info[character].image_path), gr.update(
+            value=character_timbre
+        )
+
+    def update_asr_model(self, asr_model):
+        self.pipeline.set_asr_model(asr_model)
+        return gr.update(value=asr_model)
+
+    def update_llm_model(self, llm_model):
+        self.pipeline.set_llm_model(llm_model)
+        return gr.update(value=llm_model)
+
+    def update_svs_model(self, svs_model):
+        self.current_svs_model = svs_model
+        character_timbre = self.character_info[self.current_character].default_timbre
+        self.current_timbre = self.svs_model_map[self.current_svs_model]["embeddings"][
+            character_timbre
+        ]
+        self.pipeline.set_svs_model(
+            self.svs_model_map[self.current_svs_model]["model_path"]
+        )
+        print(
+            f"SVS model updated to {self.current_svs_model}. Will set gradio svs_radio to {svs_model} and timbre_radio to {character_timbre}"
+        )
+        return (
+            gr.update(value=svs_model),
+            gr.update(
+                choices=list(
+                    self.svs_model_map[self.current_svs_model]["embeddings"].keys()
+                ),
+                value=character_timbre,
+            ),
+        )
+
+    def update_melody_source(self, melody_source):
+        self.current_melody_source = melody_source
+        return gr.update(value=self.current_melody_source)
+
+    def update_timbre(self, timbre):
+        self.current_timbre = self.svs_model_map[self.current_svs_model]["embeddings"][
+            timbre
+        ]
+        return gr.update(value=timbre)
+
+    def run_pipeline(self, audio_path):
+        results = self.pipeline.run(
+            audio_path,
+            self.svs_model_map[self.current_svs_model]["lang"],
+            self.character_info[self.current_character].prompt,
+            svs_inference_kwargs={
+                "speaker": self.current_timbre,
+            },
+            max_new_tokens=100,
+        )
+        formatted_logs = f"ASR: {results['asr_text']}\nLLM: {results['llm_text']}"
+        return gr.update(value=formatted_logs), gr.update(value=results["svs_audio"])
+
+    def run_evaluation(self, audio, audio_sample_rate):
+        pass
modules/asr.py
ADDED
@@ -0,0 +1,66 @@
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+
+import librosa
+import numpy as np
+from transformers import pipeline
+
+ASR_MODEL_REGISTRY = {}
+
+
+class AbstractASRModel(ABC):
+    @abstractmethod
+    def __init__(
+        self, model_id: str, device: str = "cpu", cache_dir: str = "cache", **kwargs
+    ):
+        self.model_id = model_id
+        self.device = device
+        self.cache_dir = cache_dir
+        pass
+
+    @abstractmethod
+    def transcribe(self, audio: np.ndarray, audio_sample_rate: int, **kwargs) -> str:
+        pass
+
+
+def register_asr_model(prefix):
+    def wrapper(cls):
+        assert issubclass(cls, AbstractASRModel), f"{cls} must inherit AbstractASRModel"
+        ASR_MODEL_REGISTRY[prefix] = cls
+        return cls
+
+    return wrapper
+
+
+def get_asr_model(model_id: str, device="cpu", **kwargs) -> AbstractASRModel:
+    for prefix, cls in ASR_MODEL_REGISTRY.items():
+        if model_id.startswith(prefix):
+            return cls(model_id, device=device, **kwargs)
+    raise ValueError(f"No ASR wrapper found for model: {model_id}")
+
+
+@register_asr_model("openai/whisper")
+class WhisperASR(AbstractASRModel):
+    def __init__(
+        self, model_id: str, device: str = "cpu", cache_dir: str = "cache", **kwargs
+    ):
+        super().__init__(model_id, device, cache_dir, **kwargs)
+        model_kwargs = kwargs.setdefault("model_kwargs", {})
+        model_kwargs["cache_dir"] = cache_dir
+        self.pipe = pipeline(
+            "automatic-speech-recognition",
+            model=model_id,
+            device=0 if device == "cuda" else -1,
+            **kwargs,
+        )
+
+    def transcribe(self, audio: np.ndarray, audio_sample_rate: int, language: str, **kwargs) -> str:
+        if audio_sample_rate != 16000:
+            try:
+                audio, _ = librosa.resample(audio, orig_sr=audio_sample_rate, target_sr=16000)
+            except Exception as e:
+                breakpoint()
+                print(f"Error resampling audio: {e}")
+                audio = librosa.resample(audio, orig_sr=audio_sample_rate, target_sr=16000)
+        return self.pipe(audio, generate_kwargs={"language": language}).get("text", "")
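A usage sketch (editor-added) of the ASR registry; the wav path is illustrative:

```python
import librosa

from modules.asr import get_asr_model

# "openai/whisper-large-v3-turbo" matches the registered "openai/whisper"
# prefix, so the registry returns a WhisperASR instance.
asr = get_asr_model("openai/whisper-large-v3-turbo", device="cpu", cache_dir=".cache")
audio, sr = librosa.load("user_turn.wav", sr=16000)  # illustrative input file
print(asr.transcribe(audio, audio_sample_rate=sr, language="zh"))
```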
modules/llm.py
ADDED
@@ -0,0 +1,54 @@
+from abc import ABC, abstractmethod
+
+from transformers import pipeline
+
+LLM_MODEL_REGISTRY = {}
+
+
+class AbstractLLMModel(ABC):
+    @abstractmethod
+    def __init__(
+        self, model_id: str, device: str = "cpu", cache_dir: str = "cache", **kwargs
+    ): ...
+
+    @abstractmethod
+    def generate(self, prompt: str, **kwargs) -> str:
+        pass
+
+
+def register_llm_model(prefix: str):
+    def wrapper(cls):
+        assert issubclass(cls, AbstractLLMModel), f"{cls} must inherit AbstractLLMModel"
+        LLM_MODEL_REGISTRY[prefix] = cls
+        return cls
+
+    return wrapper
+
+
+def get_llm_model(model_id: str, device="cpu", **kwargs) -> AbstractLLMModel:
+    for prefix, cls in LLM_MODEL_REGISTRY.items():
+        if model_id.startswith(prefix):
+            return cls(model_id, device=device, **kwargs)
+    raise ValueError(f"No LLM wrapper found for model: {model_id}")
+
+
+@register_llm_model("google/gemma")
+@register_llm_model("tii/")  # e.g., Falcon
+@register_llm_model("meta-llama")
+class HFTextGenerationLLM(AbstractLLMModel):
+    def __init__(
+        self, model_id: str, device: str = "cpu", cache_dir: str = "cache", **kwargs
+    ):
+        model_kwargs = kwargs.setdefault("model_kwargs", {})
+        model_kwargs["cache_dir"] = cache_dir
+        self.pipe = pipeline(
+            "text-generation",
+            model=model_id,
+            device=0 if device == "cuda" else -1,
+            return_full_text=False,
+            **kwargs,
+        )
+
+    def generate(self, prompt: str, **kwargs) -> str:
+        outputs = self.pipe(prompt, **kwargs)
+        return outputs[0]["generated_text"] if outputs else ""
ADDED
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import random
|
2 |
+
from typing import Iterator
|
3 |
+
|
4 |
+
from data_handlers import get_melody_handler
|
5 |
+
|
6 |
+
from .utils.g2p import preprocess_text
|
7 |
+
|
8 |
+
|
9 |
+
class MelodyController:
|
10 |
+
def __init__(self, melody_source_id: str, cache_dir: str):
|
11 |
+
self.melody_source_id = melody_source_id
|
12 |
+
self.song_id = None
|
13 |
+
|
14 |
+
# load song database if needed
|
15 |
+
parts = self.melody_source_id.split("-")
|
16 |
+
self.mode = parts[0]
|
17 |
+
self.align_type = parts[1]
|
18 |
+
dataset_name = parts[-1]
|
19 |
+
if dataset_name == "none":
|
20 |
+
self.database = None
|
21 |
+
else:
|
22 |
+
handler_cls = get_melody_handler(dataset_name)
|
23 |
+
self.database = handler_cls(self.align_type, cache_dir)
|
24 |
+
|
25 |
+
def get_melody_constraints(self, max_num_phrases: int = 5) -> str:
|
26 |
+
"""Return a lyric-format prompt based on melody structure."""
|
27 |
+
if self.mode == "gen":
|
28 |
+
return ""
|
29 |
+
|
30 |
+
elif self.mode == "sample":
|
31 |
+
assert self.database is not None, "Song database is not loaded."
|
32 |
+
self.song_id = random.choice(self.database.get_song_ids())
|
33 |
+
self.reference_song = self.database.iter_song_phrases(self.song_id)
|
34 |
+
phrase_length = self.database.get_phrase_length(self.song_id)
|
35 |
+
|
36 |
+
if not phrase_length:
|
37 |
+
return ""
|
38 |
+
|
39 |
+
prompt = (
|
40 |
+
"\n请按照歌词格式回答我的问题,每句需遵循以下字数规则:"
|
41 |
+
+ "".join(
|
42 |
+
[
|
43 |
+
f"\n第{i}句:{c}个字"
|
44 |
+
for i, c in enumerate(phrase_length[:max_num_phrases], 1)
|
45 |
+
]
|
46 |
+
)
|
47 |
+
+ "\n如果没有足够的信息回答,请使用最少的句子,不要重复、不要扩展、不要加入无关内容。\n"
|
48 |
+
)
|
49 |
+
return prompt
|
50 |
+
|
51 |
+
else:
|
52 |
+
raise ValueError(f"Unsupported melody mode: {self.mode}")
|
53 |
+
|
54 |
+
def generate_score(
|
55 |
+
self, lyrics: str, language: str
|
56 |
+
) -> list[tuple[float, float, str, int]]:
|
57 |
+
"""
|
58 |
+
lyrics: [lyric, ...]
|
59 |
+
returns: [(start, end, lyric, pitch), ...]
|
60 |
+
"""
|
61 |
+
text_list = preprocess_text(lyrics, language)
|
62 |
+
if self.mode == "gen" and self.align_type == "random":
|
63 |
+
return self._generate_random_score(text_list)
|
64 |
+
|
65 |
+
elif self.mode == "sample":
|
66 |
+
if not self.reference_song:
|
67 |
+
raise RuntimeError(
|
68 |
+
"Must call get_melody_constraints() before generate_score() in sample mode."
|
69 |
+
)
|
70 |
+
return self._align_text_to_score(
|
71 |
+
text_list, self.reference_song, self.align_type
|
72 |
+
)
|
73 |
+
|
74 |
+
else:
|
75 |
+
raise ValueError(f"Unsupported melody_source_id: {self.melody_source_id}")
|
76 |
+
|
77 |
+
def _generate_random_score(self, text_list: list[str]):
|
78 |
+
st = 0
|
79 |
+
score = []
|
80 |
+
for lyric in text_list:
|
81 |
+
pitch = random.randint(57, 69)
|
82 |
+
duration = round(random.uniform(0.1, 0.5), 4)
|
83 |
+
ed = st + duration
|
84 |
+
score.append((st, ed, lyric, pitch))
|
85 |
+
st = ed
|
86 |
+
return score
|
87 |
+
|
88 |
+
def _align_text_to_score(
|
89 |
+
self,
|
90 |
+
text_list: list[str],
|
91 |
+
song_phrase_iterator: Iterator[dict],
|
92 |
+
align_type: str,
|
93 |
+
):
|
94 |
+
score = []
|
95 |
+
text_idx = 0
|
96 |
+
|
97 |
+
while text_idx < len(text_list):
|
98 |
+
reference = next(song_phrase_iterator)
|
99 |
+
for st, ed, ref_lyric, pitch in zip(
|
100 |
+
reference["note_start_times"],
|
101 |
+
reference["note_end_times"],
|
102 |
+
reference["note_lyrics"],
|
103 |
+
reference["note_midi"],
|
104 |
+
):
|
105 |
+
assert ref_lyric not in [
|
106 |
+
"<AP>",
|
107 |
+
"<SP>",
|
108 |
+
], f"Proccessed {self.melody_source_id} score segments should not contain <AP> or <SP>." # TODO: remove in PR, only for debug
|
109 |
+
if pitch == 0:
|
110 |
+
score.append((st, ed, ref_lyric, pitch))
|
111 |
+
elif ref_lyric in ["-", "——"] and align_type == "lyric":
|
112 |
+
score.append((st, ed, ref_lyric, pitch))
|
113 |
+
text_idx += 1
|
114 |
+
else:
|
115 |
+
score.append((st, ed, text_list[text_idx], pitch))
|
116 |
+
text_idx += 1
|
117 |
+
return score
|
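A usage sketch (editor-added) of the melody controller flow; the lyric string is illustrative:

```python
from modules.melody import MelodyController

# "sample-lyric-kising" → mode="sample", align_type="lyric", dataset="kising".
controller = MelodyController("sample-lyric-kising", cache_dir=".cache")

# 1) Build the phrase-length constraint that gets appended to the character prompt.
prompt_suffix = controller.get_melody_constraints(max_num_phrases=5)

# 2) Align the LLM's lyrics to the sampled melody.
score = controller.generate_score("听啊山风起", language="mandarin")
# score: [(start_sec, end_sec, syllable, midi_pitch), ...]
```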
modules/svs/__init__.py
ADDED
@@ -0,0 +1,10 @@
+from .base import AbstractSVSModel
+from .registry import SVS_MODEL_REGISTRY, get_svs_model, register_svs_model
+from .espnet import ESPNetSVS
+
+__all__ = [
+    "AbstractSVSModel",
+    "get_svs_model",
+    "register_svs_model",
+    "SVS_MODEL_REGISTRY",
+]
modules/svs/base.py
ADDED
@@ -0,0 +1,21 @@
+from abc import ABC, abstractmethod
+
+import numpy as np
+
+
+class AbstractSVSModel(ABC):
+    @abstractmethod
+    def __init__(
+        self, model_id: str, device: str = "cpu", cache_dir: str = "cache", **kwargs
+    ): ...
+
+    @abstractmethod
+    def synthesize(
+        self,
+        score: list[tuple[float, float, str, int]],
+        **kwargs,
+    ) -> tuple[np.ndarray, int]:
+        """
+        Synthesize singing audio from music score.
+        """
+        pass
modules/svs/espnet.py
ADDED
@@ -0,0 +1,123 @@
+from typing import Callable
+
+import numpy as np
+
+from modules.utils.g2p import (
+    kana_to_phonemes_openjtalk,
+    pinyin_to_phonemes_ace,
+    pinyin_to_phonemes_opencpop,
+)
+
+from .base import AbstractSVSModel
+from .registry import register_svs_model
+
+
+@register_svs_model("espnet/")
+class ESPNetSVS(AbstractSVSModel):
+    def __init__(self, model_id: str, device="cpu", cache_dir="cache", **kwargs):
+        from espnet2.bin.svs_inference import SingingGenerate
+        from espnet_model_zoo.downloader import ModelDownloader
+
+        print(f"Downloading {model_id} to {cache_dir}")  # TODO: should improve log code
+        downloaded = ModelDownloader(cache_dir).download_and_unpack(model_id)
+        print(f"Downloaded {model_id} to {cache_dir}")  # TODO: should improve log code
+        self.model = SingingGenerate(
+            train_config=downloaded["train_config"],
+            model_file=downloaded["model_file"],
+            device=device,
+        )
+        self.model_id = model_id
+        self.output_sample_rate = self.model.fs
+        self.phoneme_mappers = self._build_phoneme_mappers()
+
+    def _build_phoneme_mappers(self) -> dict[str, Callable[[str], list[str]]]:
+        if self.model_id == "espnet/aceopencpop_svs_visinger2_40singer_pretrain":
+            phoneme_mappers = {
+                "mandarin": pinyin_to_phonemes_opencpop,
+            }
+        elif self.model_id == "espnet/mixdata_svs_visinger2_spkemb_lang_pretrained":
+
+            def mandarin_mapper(pinyin: str) -> list[str]:
+                phns = pinyin_to_phonemes_ace(pinyin)
+                return [phn + "@zh" for phn in phns]
+
+            def japanese_mapper(kana: str) -> list[str]:
+                phones = kana_to_phonemes_openjtalk(kana)
+                return [phn + "@jp" for phn in phones]
+
+            phoneme_mappers = {
+                "mandarin": mandarin_mapper,
+                "japanese": japanese_mapper,
+            }
+        else:
+            phoneme_mappers = {}
+        return phoneme_mappers
+
+    def _preprocess(self, score: list[tuple[float, float, str, int]], language: str):
+        if language not in self.phoneme_mappers:
+            raise ValueError(f"Unsupported language: {language} for {self.model_id}")
+        phoneme_mapper = self.phoneme_mappers[language]
+
+        # text to phoneme
+        notes = []
+        phns = []
+        pre_phn = None
+        for st, ed, text, pitch in score:
+            assert text not in [
+                "<AP>",
+                "<SP>",
+            ], f"Proccessed score segments should not contain <AP> or <SP>. {score}"  # TODO: remove in PR, only for debug
+            if text == "AP" or text == "SP":
+                lyric_units = [text]
+                phn_units = [text]
+            elif text == "-" or text == "——":
+                lyric_units = [text]
+                if pre_phn is None:
+                    raise ValueError(
+                        f"Text `{text}` cannot be recognized by {self.model_id}. Lyrics cannot start with a lyric continuation symbol `-` or `——`"
+                    )
+                phn_units = [pre_phn]
+            else:
+                try:
+                    lyric_units = phoneme_mapper(text)
+                except ValueError as e:
+                    raise ValueError(
+                        f"Text `{text}` cannot be recognized by {self.model_id}"
+                    ) from e
+                phn_units = lyric_units
+            notes.append((st, ed, "".join(lyric_units), pitch, "_".join(phn_units)))
+            phns.extend(phn_units)
+            pre_phn = phn_units[-1]
+
+        batch = {
+            "score": {
+                "tempo": 120,  # does not affect svs result, as note durations are in time unit
+                "notes": notes,
+            },
+            "text": " ".join(phns),
+        }
+        return batch
+
+    def synthesize(
+        self, score: list[tuple[float, float, str, int]], language: str, **kwargs
+    ):
+        batch = self._preprocess(score, language)
+        if self.model_id == "espnet/aceopencpop_svs_visinger2_40singer_pretrain":
+            sid = np.array([int(kwargs["speaker"])])
+            output_dict = self.model(batch, sids=sid)
+        elif self.model_id == "espnet/mixdata_svs_visinger2_spkemb_lang_pretrained":
+            langs = {
+                "zh": 2,
+                "jp": 1,
+            }
+            if language not in langs:
+                raise ValueError(
+                    f"Unsupported language: {language} for {self.model_id}"
+                )
+            lid = np.array([langs[language]])
+            spk_embed = np.load(kwargs["speaker"])
+            output_dict = self.model(batch, lids=lid, spembs=spk_embed)
+        else:
+            raise NotImplementedError(f"Model {self.model_id} not supported")
+        wav_info = output_dict["wav"].cpu().numpy()
+        return wav_info, self.output_sample_rate
modules/svs/registry.py
ADDED
@@ -0,0 +1,19 @@
+from .base import AbstractSVSModel
+
+SVS_MODEL_REGISTRY = {}
+
+
+def register_svs_model(prefix: str):
+    def wrapper(cls):
+        assert issubclass(cls, AbstractSVSModel), f"{cls} must inherit AbstractSVSModel"
+        SVS_MODEL_REGISTRY[prefix] = cls
+        return cls
+
+    return wrapper
+
+
+def get_svs_model(model_id: str, device="cpu", **kwargs) -> AbstractSVSModel:
+    for prefix, cls in SVS_MODEL_REGISTRY.items():
+        if model_id.startswith(prefix):
+            return cls(model_id, device=device, **kwargs)
+    raise ValueError(f"No SVS wrapper found for model: {model_id}")
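A usage sketch (editor-added) tying the registry to the ESPnet wrapper above; the score values and speaker id (timbre1 = 5 in config/options.yaml) are illustrative:

```python
from modules.svs import get_svs_model

# Any model id starting with "espnet/" resolves to ESPNetSVS.
svs = get_svs_model(
    "espnet/aceopencpop_svs_visinger2_40singer_pretrain",
    device="cpu",
    cache_dir=".cache",
)
score = [(0.0, 0.4, "ni", 62), (0.4, 0.8, "hao", 64)]  # (start, end, lyric, pitch)
wav, sr = svs.synthesize(score, language="mandarin", speaker=5)
```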
modules/utils/g2p.py
ADDED
@@ -0,0 +1,175 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import json
import re
import warnings
from pathlib import Path

from kanjiconv import KanjiConv
from pypinyin import lazy_pinyin

from .resources.pinyin_dict import PINYIN_DICT

kanji_to_kana = KanjiConv()

yoon_map = {
    "ぁ": "あ",
    "ぃ": "い",
    "ぅ": "う",
    "ぇ": "え",
    "ぉ": "お",
    "ゃ": "や",
    "ゅ": "ゆ",
    "ょ": "よ",
    "ゎ": "わ",
}

# Load the ACE phoneme plans and keep the Mandarin ("zh") plan.
with open(Path(__file__).parent / "resources" / "all_plans.json", "r") as f:
    ace_phonemes_all_plans = json.load(f)
for plan in ace_phonemes_all_plans["plans"]:
    if plan["language"] == "zh":
        ace_phonemes_zh_plan = plan
        break


def preprocess_text(text: str, language: str) -> list[str]:
    if language == "mandarin":
        text_list = to_pinyin(text)
    elif language == "japanese":
        text_list = to_kana(text)
    else:
        raise ValueError(f"Unsupported language: {language}")
    return text_list


def to_pinyin(text: str) -> list[str]:
    pinyin_list = lazy_pinyin(text)
    text_list = []
    for syllable in pinyin_list:
        if syllable[0] in ("S", "A", "-"):
            # Split rest/breath markers ("AP", "SP", "-") into separate tokens.
            sp_strs = re.findall(r"-|AP|SP", syllable)
            for phn in sp_strs:
                text_list.append(phn)
        else:
            text_list.append(syllable)
    return text_list


def replace_chouonpu(hiragana_text: str) -> str:
    """Expand the long-vowel mark 「ー」, which the upstream converters do not handle."""
    vowels = {
        "あ": "あ",
        "い": "い",
        "う": "う",
        "え": "え",
        "お": "う",
        "か": "あ",
        "き": "い",
        "く": "う",
        "け": "え",
        "こ": "う",
        "さ": "あ",
        "し": "い",
        "す": "う",
        "せ": "え",
        "そ": "う",
        "た": "あ",
        "ち": "い",
        "つ": "う",
        "て": "え",
        "と": "う",
        "な": "あ",
        "に": "い",
        "ぬ": "う",
        "ね": "え",
        "の": "う",
        "は": "あ",
        "ひ": "い",
        "ふ": "う",
        "へ": "え",
        "ほ": "う",
        "ま": "あ",
        "み": "い",
        "む": "う",
        "め": "え",
        "も": "う",
        "や": "あ",
        "ゆ": "う",
        "よ": "う",
        "ら": "あ",
        "り": "い",
        "る": "う",
        "れ": "え",
        "ろ": "う",
        "わ": "あ",
        "を": "う",
    }
    new_text = []
    for i, char in enumerate(hiragana_text):
        if char == "ー" and i > 0:
            prev_char = new_text[-1]
            if prev_char in yoon_map:
                prev_char = yoon_map[prev_char]
            new_text.append(vowels.get(prev_char, prev_char))
        else:
            new_text.append(char)
    return "".join(new_text)


def to_kana(text: str) -> list[str]:
    hiragana_text = kanji_to_kana.to_hiragana(text.replace(" ", ""))
    hiragana_text_wl = replace_chouonpu(hiragana_text).split(" ")
    final_ls = []
    for subword in hiragana_text_wl:
        sl_prev = 0
        for i in range(len(subword) - 1):
            if sl_prev >= len(subword) - 1:
                break
            sl = sl_prev + 1
            if subword[sl] in yoon_map:
                # Keep a yoon pair such as "きゃ" together as one unit.
                final_ls.append(subword[sl_prev : sl + 1])
                sl_prev += 2
            else:
                final_ls.append(subword[sl_prev])
                sl_prev += 1
        if sl_prev < len(subword):
            # The last character may already have been consumed as part of a yoon pair.
            final_ls.append(subword[sl_prev])
    return final_ls


def kana_to_phonemes_openjtalk(kana: str) -> list[str]:
    import pyopenjtalk

    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")
        # Add a space between each character so each kana is converted separately.
        kana = " ".join(list(kana))
        # `phones` is a space-separated string.
        phones = pyopenjtalk.g2p(kana, kana=False)
    if len(w) > 0:
        for warning in w:
            if "No phoneme" in str(warning.message):
                raise ValueError(f"No phoneme found for {kana}. {warning.message}")
    phones = phones.split(" ")
    return phones


def pinyin_to_phonemes_opencpop(pinyin: str) -> list[str]:
    pinyin = pinyin.lower()
    if pinyin in ace_phonemes_zh_plan["dict"]:
        phns = ace_phonemes_zh_plan["dict"][pinyin]
        return phns
    elif pinyin in ace_phonemes_zh_plan["syllable_alias"]:
        phns = ace_phonemes_zh_plan["dict"][
            ace_phonemes_zh_plan["syllable_alias"][pinyin]
        ]
        return phns
    else:
        raise ValueError(f"{pinyin} not registered in Opencpop phoneme dict")


def pinyin_to_phonemes_ace(pinyin: str) -> list[str]:
    pinyin = pinyin.lower()
    if pinyin in PINYIN_DICT:
        phns = PINYIN_DICT[pinyin]
        return phns
    else:
        raise ValueError(f"{pinyin} not registered in ACE phoneme dict")
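Taken together, `preprocess_text` splits a response into singable units (pinyin syllables for Mandarin, kana units for Japanese), which the `*_to_phonemes_*` helpers then map to phoneme tuples. A minimal sketch of that chain; the example strings and the commented outputs are illustrative assumptions, not fixtures from this repo:

```python
from modules.utils.g2p import preprocess_text, pinyin_to_phonemes_ace

# Mandarin: characters -> pinyin syllables -> (initial, final) phoneme tuples
syllables = preprocess_text("你好", "mandarin")            # e.g. ["ni", "hao"]
phonemes = [pinyin_to_phonemes_ace(s) for s in syllables]  # e.g. [("n", "i"), ("h", "ao")]

# Japanese: kanji/kana -> kana units; yoon pairs such as "きょ" stay together
kana_units = preprocess_text("東京", "japanese")           # e.g. ["と", "う", "きょ", "う"]
```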
{resource → modules/utils/resources}/all_plans.json
RENAMED
File without changes

{resource → modules/utils/resources}/pinyin_dict.py
RENAMED
File without changes

modules/utils/text_normalize.py
ADDED
@@ -0,0 +1,31 @@
import re
from typing import Optional


def remove_non_zh_jp(text: str) -> str:
    pattern = r"[^\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\u3000-\u303f\uff01-\uffef]"
    return re.sub(pattern, "", text)


def truncate_sentences(text: str, max_sentences: int) -> str:
    sentences = re.split(r"(?<=[。!?])", text)
    return "".join(sentences[:max_sentences]).strip()


def clean_llm_output(
    text: str,
    max_sentences: Optional[int] = 2,
    seg_syb: str = " ",
    language: str = "mandarin",
) -> str:
    if language not in ["mandarin", "japanese"]:
        raise NotImplementedError(f"Unsupported language: {language}")
    text = text.strip()
    if max_sentences is not None:
        text = truncate_sentences(text, max_sentences)
    text = remove_non_zh_jp(text)
    text = re.sub(r"[^\w\s\u4e00-\u9fff]", " ", text)  # Remove punctuation
    text = re.sub(r"\s+", " ", text)  # Normalize whitespace
    text = text.replace("\n", seg_syb)
    text = text.replace(" ", seg_syb)
    return text
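As a quick illustration of `clean_llm_output` with the default `max_sentences=2` (the sample string and the resulting text below are assumptions for demonstration):

```python
from modules.utils.text_normalize import clean_llm_output

raw = "你好!我是丽梅。Nice to meet you!今天想聊什么呢?"
cleaned = clean_llm_output(raw, language="mandarin")
# Truncates to the first two sentences, drops the non-CJK text and punctuation,
# and collapses whitespace: roughly "你好 我是丽梅".
```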
offline_process/create_features.py
DELETED
@@ -1,71 +0,0 @@
from datasets import load_dataset, concatenate_datasets

ds = load_dataset("espnet/ace-kising-segments", cache_dir="cache")

combined = concatenate_datasets([ds["train"], ds["validation"], ds["test"]])

# 2. filter rows by singer: barber
combined = combined.filter(lambda x: x["singer"] == "barber")

# 3. add columns counting the non-zero MIDI notes and the actual words per segment
combined = combined.map(
    lambda x: {
        "note_midi_length": len([n for n in x["note_midi"] if n != 0]),
        "lyric_word_length": len(
            [word for word in x["note_lyrics"] if word not in ["<AP>", "<SP>", "-"]]
        ),  # counts the number of actual words (or characters for, e.g., Chinese/Japanese)
    }
)
combined = combined.map(
    lambda x: {
        "lyric_word_length": len(
            [word for word in x["note_lyrics"] if word not in ["<AP>", "<SP>", "-"]]
        )
    }  # counts the number of actual words (or characters for, e.g., Chinese/Japanese)
)

# 4. sort by segment_id
combined = combined.sort("segment_id")

# 5. iterate over rows, collecting per-song lists of segment lengths
prev_songid = None
prev_song_segment_id = None
song2note_lengths = {}
song2word_lengths = {}
for row in combined:
    # segment_id: kising_barber_{songid}_{song_segment_id}
    _, _, songid, song_segment_id = row["segment_id"].split("_")
    if prev_songid != songid:
        if prev_songid is not None:
            assert (
                song_segment_id == "001"
            ), f"prev_songid: {prev_songid}, songid: {songid}, song_segment_id: {song_segment_id}"
        song2note_lengths[f"kising_{songid}"] = [row["note_midi_length"]]
        song2word_lengths[f"kising_{songid}"] = [row["lyric_word_length"]]
    else:
        assert (
            int(song_segment_id) >= int(prev_song_segment_id) + 1
        ), f"prev_song_segment_id: {prev_song_segment_id}, song_segment_id: {song_segment_id}"
        song2note_lengths[f"kising_{songid}"].append(row["note_midi_length"])
        song2word_lengths[f"kising_{songid}"].append(row["lyric_word_length"])
    prev_songid = songid
    prev_song_segment_id = song_segment_id

# 6. write to json
import json

with open("data/song2note_lengths.json", "w") as f:
    json.dump(song2note_lengths, f, indent=4)

with open("data/song2word_lengths.json", "w") as f:
    json.dump(song2word_lengths, f, indent=4)

# 7. push score segments to hub
# remove audio and singer columns
combined = combined.remove_columns(["audio", "singer"])
# replace kising_barber_ with kising_
combined = combined.map(
    lambda x: {"segment_id": x["segment_id"].replace("kising_barber_", "kising_")}
)
# upload to hub
combined.push_to_hub("jhansss/kising_score_segments")
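For reference, the JSON files this now-removed script produced (relocated in this refactor under `data/kising/`) map each song to a list of per-segment lengths; the song IDs and counts below are illustrative, not real values:

```python
# Shape of data/kising/song2note_lengths.json (values illustrative):
song2note_lengths = {
    "kising_001": [12, 9, 14],  # non-rest note count of each segment of song 001
    "kising_002": [11, 8, 13],
}
```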
path.sh
DELETED
@@ -1,3 +0,0 @@
#!/bin/bash

. ~/workspace/SingingSDS/activate_python.sh
pipeline.py
ADDED
@@ -0,0 +1,103 @@
import time

import librosa
import torch

from modules.asr import get_asr_model
from modules.llm import get_llm_model
from modules.svs import get_svs_model
from evaluation.svs_eval import load_evaluators, run_evaluation
from modules.melody import MelodyController
from modules.utils.text_normalize import clean_llm_output


class SingingDialoguePipeline:
    def __init__(self, config: dict):
        if "device" in config:
            self.device = config["device"]
        else:
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.cache_dir = config["cache_dir"]
        self.asr = get_asr_model(
            config["asr_model"], device=self.device, cache_dir=self.cache_dir
        )
        self.llm = get_llm_model(
            config["llm_model"], device=self.device, cache_dir=self.cache_dir
        )
        self.svs = get_svs_model(
            config["svs_model"], device=self.device, cache_dir=self.cache_dir
        )
        self.melody_controller = MelodyController(
            config["melody_source"], self.cache_dir
        )
        self.track_latency = config.get("track_latency", False)
        self.evaluators = load_evaluators(config.get("evaluators", {}).get("svs", []))

    def set_asr_model(self, asr_model: str):
        self.asr = get_asr_model(asr_model, device=self.device, cache_dir=self.cache_dir)

    def set_llm_model(self, llm_model: str):
        self.llm = get_llm_model(llm_model, device=self.device, cache_dir=self.cache_dir)

    def set_svs_model(self, svs_model: str):
        self.svs = get_svs_model(svs_model, device=self.device, cache_dir=self.cache_dir)

    def set_melody_controller(self, melody_source: str):
        self.melody_controller = MelodyController(melody_source, self.cache_dir)

    def run(
        self,
        audio_path,
        language,
        prompt_template,
        svs_inference_kwargs,
        max_new_tokens=100,
    ):
        if self.track_latency:
            asr_start_time = time.time()
        audio_array, audio_sample_rate = librosa.load(audio_path, sr=16000)
        asr_result = self.asr.transcribe(
            audio_array, audio_sample_rate=audio_sample_rate, language=language
        )
        if self.track_latency:
            asr_latency = time.time() - asr_start_time
        melody_prompt = self.melody_controller.get_melody_constraints()
        prompt = prompt_template.format(melody_prompt, asr_result)
        if self.track_latency:
            llm_start_time = time.time()
        output = self.llm.generate(prompt, max_new_tokens=max_new_tokens)
        if self.track_latency:
            llm_latency = time.time() - llm_start_time
        print(f"llm output: {output}")  # TODO: verify the output does not echo the prompt
        llm_response = clean_llm_output(output, language=language)
        score = self.melody_controller.generate_score(llm_response, language)
        if self.track_latency:
            svs_start_time = time.time()
        singing_audio, sample_rate = self.svs.synthesize(
            score, language=language, **svs_inference_kwargs
        )
        if self.track_latency:
            svs_latency = time.time() - svs_start_time
        results = {
            "asr_text": asr_result,
            "llm_text": llm_response,
            "svs_audio": (singing_audio, sample_rate),
        }
        if self.track_latency:
            # Store latencies under a fresh "metrics" key (the dict above does not define one).
            results["metrics"] = {
                "asr_latency": asr_latency,
                "llm_latency": llm_latency,
                "svs_latency": svs_latency,
            }
        return results

    def evaluate(self, audio, sample_rate):
        return run_evaluation(audio, sample_rate, self.evaluators)
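For orientation, a minimal sketch of driving the new pipeline end to end. The config keys mirror what `__init__` reads; the model IDs, melody source, input file, and prompt template below are placeholders, not values from this diff:

```python
from pipeline import SingingDialoguePipeline

config = {
    "cache_dir": "cache",
    "asr_model": "<asr-model-id>",       # placeholder
    "llm_model": "<llm-model-id>",       # placeholder
    "svs_model": "<svs-model-id>",       # placeholder
    "melody_source": "<melody-source>",  # placeholder
    "track_latency": True,
    "evaluators": {"svs": []},
}

pipeline = SingingDialoguePipeline(config)
results = pipeline.run(
    audio_path="user_turn.wav",           # hypothetical recording of the user's turn
    language="mandarin",
    prompt_template="{}\nUser said: {}",  # two positional slots: melody prompt, ASR text
    svs_inference_kwargs={},
)
singing_audio, sample_rate = results["svs_audio"]
```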
{resource → resources}/__init__.py
RENAMED
File without changes

resources/all_plans.json
ADDED
The diff for this file is too large to render. See raw diff.

{resource → resources}/midi-note.scp
RENAMED
File without changes

resources/pinyin_dict.py
ADDED
@@ -0,0 +1,423 @@
# Adapted from Opencpop's pinyin to phoneme mapping table:
# https://wenet.org.cn/opencpop/resources/annotationformat/
PINYIN_DICT = {
    "a": ("a",),
    "ai": ("ai",),
    "an": ("an",),
    "ang": ("ang",),
    "ao": ("ao",),
    "ba": ("b", "a"),
    "bai": ("b", "ai"),
    "ban": ("b", "an"),
    "bang": ("b", "ang"),
    "bao": ("b", "ao"),
    "bei": ("b", "ei"),
    "ben": ("b", "en"),
    "beng": ("b", "eng"),
    "bi": ("b", "i"),
    "bian": ("b", "ian"),
    "biao": ("b", "iao"),
    "bie": ("b", "ie"),
    "bin": ("b", "in"),
    "bing": ("b", "ing"),
    "bo": ("b", "o"),
    "bu": ("b", "u"),
    "ca": ("c", "a"),
    "cai": ("c", "ai"),
    "can": ("c", "an"),
    "cang": ("c", "ang"),
    "cao": ("c", "ao"),
    "ce": ("c", "e"),
    "cei": ("c", "ei"),
    "cen": ("c", "en"),
    "ceng": ("c", "eng"),
    "cha": ("ch", "a"),
    "chai": ("ch", "ai"),
    "chan": ("ch", "an"),
    "chang": ("ch", "ang"),
    "chao": ("ch", "ao"),
    "che": ("ch", "e"),
    "chen": ("ch", "en"),
    "cheng": ("ch", "eng"),
    "chi": ("ch", "i"),
    "chong": ("ch", "ong"),
    "chou": ("ch", "ou"),
    "chu": ("ch", "u"),
    "chua": ("ch", "ua"),
    "chuai": ("ch", "uai"),
    "chuan": ("ch", "uan"),
    "chuang": ("ch", "uang"),
    "chui": ("ch", "ui"),
    "chun": ("ch", "un"),
    "chuo": ("ch", "uo"),
    "ci": ("c", "i"),
    "cong": ("c", "ong"),
    "cou": ("c", "ou"),
    "cu": ("c", "u"),
    "cuan": ("c", "uan"),
    "cui": ("c", "ui"),
    "cun": ("c", "un"),
    "cuo": ("c", "uo"),
    "da": ("d", "a"),
    "dai": ("d", "ai"),
    "dan": ("d", "an"),
    "dang": ("d", "ang"),
    "dao": ("d", "ao"),
    "de": ("d", "e"),
    "dei": ("d", "ei"),
    "den": ("d", "en"),
    "deng": ("d", "eng"),
    "di": ("d", "i"),
    "dia": ("d", "ia"),
    "dian": ("d", "ian"),
    "diao": ("d", "iao"),
    "die": ("d", "ie"),
    "ding": ("d", "ing"),
    "diu": ("d", "iu"),
    "dong": ("d", "ong"),
    "dou": ("d", "ou"),
    "du": ("d", "u"),
    "duan": ("d", "uan"),
    "dui": ("d", "ui"),
    "dun": ("d", "un"),
    "duo": ("d", "uo"),
    "e": ("e",),
    "ei": ("ei",),
    "en": ("en",),
    "eng": ("eng",),
    "er": ("er",),
    "fa": ("f", "a"),
    "fan": ("f", "an"),
    "fang": ("f", "ang"),
    "fei": ("f", "ei"),
    "fen": ("f", "en"),
    "feng": ("f", "eng"),
    "fo": ("f", "o"),
    "fou": ("f", "ou"),
    "fu": ("f", "u"),
    "ga": ("g", "a"),
    "gai": ("g", "ai"),
    "gan": ("g", "an"),
    "gang": ("g", "ang"),
    "gao": ("g", "ao"),
    "ge": ("g", "e"),
    "gei": ("g", "ei"),
    "gen": ("g", "en"),
    "geng": ("g", "eng"),
    "gong": ("g", "ong"),
    "gou": ("g", "ou"),
    "gu": ("g", "u"),
    "gua": ("g", "ua"),
    "guai": ("g", "uai"),
    "guan": ("g", "uan"),
    "guang": ("g", "uang"),
    "gui": ("g", "ui"),
    "gun": ("g", "un"),
    "guo": ("g", "uo"),
    "ha": ("h", "a"),
    "hai": ("h", "ai"),
    "han": ("h", "an"),
    "hang": ("h", "ang"),
    "hao": ("h", "ao"),
    "he": ("h", "e"),
    "hei": ("h", "ei"),
    "hen": ("h", "en"),
    "heng": ("h", "eng"),
    "hm": ("h", "m"),
    "hng": ("h", "ng"),
    "hong": ("h", "ong"),
    "hou": ("h", "ou"),
    "hu": ("h", "u"),
    "hua": ("h", "ua"),
    "huai": ("h", "uai"),
    "huan": ("h", "uan"),
    "huang": ("h", "uang"),
    "hui": ("h", "ui"),
    "hun": ("h", "un"),
    "huo": ("h", "uo"),
    "ji": ("j", "i"),
    "jia": ("j", "ia"),
    "jian": ("j", "ian"),
    "jiang": ("j", "iang"),
    "jiao": ("j", "iao"),
    "jie": ("j", "ie"),
    "jin": ("j", "in"),
    "jing": ("j", "ing"),
    "jiong": ("j", "iong"),
    "jiu": ("j", "iu"),
    "ju": ("j", "v"),
    "juan": ("j", "van"),
    "jue": ("j", "ve"),
    "jun": ("j", "vn"),
    "ka": ("k", "a"),
    "kai": ("k", "ai"),
    "kan": ("k", "an"),
    "kang": ("k", "ang"),
    "kao": ("k", "ao"),
    "ke": ("k", "e"),
    "kei": ("k", "ei"),
    "ken": ("k", "en"),
    "keng": ("k", "eng"),
    "kong": ("k", "ong"),
    "kou": ("k", "ou"),
    "ku": ("k", "u"),
    "kua": ("k", "ua"),
    "kuai": ("k", "uai"),
    "kuan": ("k", "uan"),
    "kuang": ("k", "uang"),
    "kui": ("k", "ui"),
    "kun": ("k", "un"),
    "kuo": ("k", "uo"),
    "la": ("l", "a"),
    "lai": ("l", "ai"),
    "lan": ("l", "an"),
    "lang": ("l", "ang"),
    "lao": ("l", "ao"),
    "le": ("l", "e"),
    "lei": ("l", "ei"),
    "leng": ("l", "eng"),
    "li": ("l", "i"),
    "lia": ("l", "ia"),
    "lian": ("l", "ian"),
    "liang": ("l", "iang"),
    "liao": ("l", "iao"),
    "lie": ("l", "ie"),
    "lin": ("l", "in"),
    "ling": ("l", "ing"),
    "liu": ("l", "iu"),
    "lo": ("l", "o"),
    "long": ("l", "ong"),
    "lou": ("l", "ou"),
    "lu": ("l", "u"),
    "luan": ("l", "uan"),
    "lun": ("l", "un"),
    "luo": ("l", "uo"),
    "lv": ("l", "v"),
    "lve": ("l", "ve"),
    "m": ("m",),
    "ma": ("m", "a"),
    "mai": ("m", "ai"),
    "man": ("m", "an"),
    "mang": ("m", "ang"),
    "mao": ("m", "ao"),
    "me": ("m", "e"),
    "mei": ("m", "ei"),
    "men": ("m", "en"),
    "meng": ("m", "eng"),
    "mi": ("m", "i"),
    "mian": ("m", "ian"),
    "miao": ("m", "iao"),
    "mie": ("m", "ie"),
    "min": ("m", "in"),
    "ming": ("m", "ing"),
    "miu": ("m", "iu"),
    "mo": ("m", "o"),
    "mou": ("m", "ou"),
    "mu": ("m", "u"),
    "n": ("n",),
    "na": ("n", "a"),
    "nai": ("n", "ai"),
    "nan": ("n", "an"),
    "nang": ("n", "ang"),
    "nao": ("n", "ao"),
    "ne": ("n", "e"),
    "nei": ("n", "ei"),
    "nen": ("n", "en"),
    "neng": ("n", "eng"),
    "ng": ("n", "g"),
    "ni": ("n", "i"),
    "nian": ("n", "ian"),
    "niang": ("n", "iang"),
    "niao": ("n", "iao"),
    "nie": ("n", "ie"),
    "nin": ("n", "in"),
    "ning": ("n", "ing"),
    "niu": ("n", "iu"),
    "nong": ("n", "ong"),
    "nou": ("n", "ou"),
    "nu": ("n", "u"),
    "nuan": ("n", "uan"),
    "nun": ("n", "un"),
    "nuo": ("n", "uo"),
    "nv": ("n", "v"),
    "nve": ("n", "ve"),
    "o": ("o",),
    "ou": ("ou",),
    "pa": ("p", "a"),
    "pai": ("p", "ai"),
    "pan": ("p", "an"),
    "pang": ("p", "ang"),
    "pao": ("p", "ao"),
    "pei": ("p", "ei"),
    "pen": ("p", "en"),
    "peng": ("p", "eng"),
    "pi": ("p", "i"),
    "pian": ("p", "ian"),
    "piao": ("p", "iao"),
    "pie": ("p", "ie"),
    "pin": ("p", "in"),
    "ping": ("p", "ing"),
    "po": ("p", "o"),
    "pou": ("p", "ou"),
    "pu": ("p", "u"),
    "qi": ("q", "i"),
    "qia": ("q", "ia"),
    "qian": ("q", "ian"),
    "qiang": ("q", "iang"),
    "qiao": ("q", "iao"),
    "qie": ("q", "ie"),
    "qin": ("q", "in"),
    "qing": ("q", "ing"),
    "qiong": ("q", "iong"),
    "qiu": ("q", "iu"),
    "qu": ("q", "v"),
    "quan": ("q", "van"),
    "que": ("q", "ve"),
    "qun": ("q", "vn"),
    "ran": ("r", "an"),
    "rang": ("r", "ang"),
    "rao": ("r", "ao"),
    "re": ("r", "e"),
    "ren": ("r", "en"),
    "reng": ("r", "eng"),
    "ri": ("r", "i"),
    "rong": ("r", "ong"),
    "rou": ("r", "ou"),
    "ru": ("r", "u"),
    "rua": ("r", "ua"),
    "ruan": ("r", "uan"),
    "rui": ("r", "ui"),
    "run": ("r", "un"),
    "ruo": ("r", "uo"),
    "sa": ("s", "a"),
    "sai": ("s", "ai"),
    "san": ("s", "an"),
    "sang": ("s", "ang"),
    "sao": ("s", "ao"),
    "se": ("s", "e"),
    "sen": ("s", "en"),
    "seng": ("s", "eng"),
    "sha": ("sh", "a"),
    "shai": ("sh", "ai"),
    "shan": ("sh", "an"),
    "shang": ("sh", "ang"),
    "shao": ("sh", "ao"),
    "she": ("sh", "e"),
    "shei": ("sh", "ei"),
    "shen": ("sh", "en"),
    "sheng": ("sh", "eng"),
    "shi": ("sh", "i"),
    "shou": ("sh", "ou"),
    "shu": ("sh", "u"),
    "shua": ("sh", "ua"),
    "shuai": ("sh", "uai"),
    "shuan": ("sh", "uan"),
    "shuang": ("sh", "uang"),
    "shui": ("sh", "ui"),
    "shun": ("sh", "un"),
    "shuo": ("sh", "uo"),
    "si": ("s", "i"),
    "song": ("s", "ong"),
    "sou": ("s", "ou"),
    "su": ("s", "u"),
    "suan": ("s", "uan"),
    "sui": ("s", "ui"),
    "sun": ("s", "un"),
    "suo": ("s", "uo"),
    "ta": ("t", "a"),
    "tai": ("t", "ai"),
    "tan": ("t", "an"),
    "tang": ("t", "ang"),
    "tao": ("t", "ao"),
    "te": ("t", "e"),
    "tei": ("t", "ei"),
    "teng": ("t", "eng"),
    "ti": ("t", "i"),
    "tian": ("t", "ian"),
    "tiao": ("t", "iao"),
    "tie": ("t", "ie"),
    "ting": ("t", "ing"),
    "tong": ("t", "ong"),
    "tou": ("t", "ou"),
    "tu": ("t", "u"),
    "tuan": ("t", "uan"),
    "tui": ("t", "ui"),
    "tun": ("t", "un"),
    "tuo": ("t", "uo"),
    "wa": ("w", "a"),
    "wai": ("w", "ai"),
    "wan": ("w", "an"),
    "wang": ("w", "ang"),
    "wei": ("w", "ei"),
    "wen": ("w", "en"),
    "weng": ("w", "eng"),
    "wo": ("w", "o"),
    "wu": ("w", "u"),
    "xi": ("x", "i"),
    "xia": ("x", "ia"),
    "xian": ("x", "ian"),
    "xiang": ("x", "iang"),
    "xiao": ("x", "iao"),
    "xie": ("x", "ie"),
    "xin": ("x", "in"),
    "xing": ("x", "ing"),
    "xiong": ("x", "iong"),
    "xiu": ("x", "iu"),
    "xu": ("x", "v"),
    "xuan": ("x", "van"),
    "xue": ("x", "ve"),
    "xun": ("x", "vn"),
    "ya": ("y", "a"),
    "yan": ("y", "an"),
    "yang": ("y", "ang"),
    "yao": ("y", "ao"),
    "ye": ("y", "e"),
    "yi": ("y", "i"),
    "yin": ("y", "in"),
    "ying": ("y", "ing"),
    "yo": ("y", "o"),
    "yong": ("y", "ong"),
    "you": ("y", "ou"),
    "yu": ("y", "v"),
    "yuan": ("y", "van"),
    "yue": ("y", "ve"),
    "yun": ("y", "vn"),
    "za": ("z", "a"),
    "zai": ("z", "ai"),
    "zan": ("z", "an"),
    "zang": ("z", "ang"),
    "zao": ("z", "ao"),
    "ze": ("z", "e"),
    "zei": ("z", "ei"),
    "zen": ("z", "en"),
    "zeng": ("z", "eng"),
    "zha": ("zh", "a"),
    "zhai": ("zh", "ai"),
    "zhan": ("zh", "an"),
    "zhang": ("zh", "ang"),
    "zhao": ("zh", "ao"),
    "zhe": ("zh", "e"),
    "zhei": ("zh", "ei"),
    "zhen": ("zh", "en"),
    "zheng": ("zh", "eng"),
    "zhi": ("zh", "i"),
    "zhong": ("zh", "ong"),
    "zhou": ("zh", "ou"),
    "zhu": ("zh", "u"),
    "zhua": ("zh", "ua"),
    "zhuai": ("zh", "uai"),
    "zhuan": ("zh", "uan"),
    "zhuang": ("zh", "uang"),
    "zhui": ("zh", "ui"),
    "zhun": ("zh", "un"),
    "zhuo": ("zh", "uo"),
    "zi": ("z", "i"),
    "zong": ("z", "ong"),
    "zou": ("z", "ou"),
    "zu": ("z", "u"),
    "zuan": ("z", "uan"),
    "zui": ("z", "ui"),
    "zun": ("z", "un"),
    "zuo": ("z", "uo"),
}
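The table is keyed by toneless pinyin syllables; each value is the (initial, final) split that `pinyin_to_phonemes_ace` returns. A small lookup sketch:

```python
from resources.pinyin_dict import PINYIN_DICT

print(PINYIN_DICT["zhuang"])  # ("zh", "uang")
print(PINYIN_DICT["a"])       # ("a",)  zero-initial syllables map to a 1-tuple
print(PINYIN_DICT["lv"])      # ("l", "v")  "v" stands in for the umlaut vowel ü
```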
{resource → resources}/singer/singer_embedding_ace-1.npy
RENAMED
File without changes

{resource → resources}/singer/singer_embedding_ace-10.npy
RENAMED
File without changes

{resource → resources}/singer/singer_embedding_ace-11.npy
RENAMED
File without changes

{resource → resources}/singer/singer_embedding_ace-12.npy
RENAMED
File without changes

{resource → resources}/singer/singer_embedding_ace-13.npy
RENAMED
File without changes

{resource → resources}/singer/singer_embedding_ace-14.npy
RENAMED
File without changes

{resource → resources}/singer/singer_embedding_ace-15.npy
RENAMED
File without changes

{resource → resources}/singer/singer_embedding_ace-16.npy
RENAMED
File without changes

{resource → resources}/singer/singer_embedding_ace-17.npy
RENAMED
File without changes

{resource → resources}/singer/singer_embedding_ace-18.npy
RENAMED
File without changes

{resource → resources}/singer/singer_embedding_ace-19.npy
RENAMED
File without changes