jhansss committed
Commit 91394e0 · 1 Parent(s): 93bddf5

refactor init

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. README.md +0 -11
  2. app.py +12 -0
  3. character.png → assets/character_limei.png +0 -0
  4. assets/character_yaoyin.jpg +3 -0
  5. characters/Limei.py +36 -0
  6. characters/Yaoyin.py +38 -0
  7. characters/__init__.py +16 -0
  8. characters/base.py +9 -0
  9. client.py +0 -58
  10. client/client.py +0 -54
  11. client/requirements.txt +0 -1
  12. config/default.yaml +15 -0
  13. config/options.yaml +63 -0
  14. data/{song2note_lengths.json → kising/song2note_lengths.json} +0 -0
  15. data/{song2word_lengths.json → kising/song2word_lengths.json} +0 -0
  16. data_handlers/__init__.py +27 -0
  17. data_handlers/base.py +21 -0
  18. data_handlers/kising.py +44 -0
  19. data_handlers/touhou.py +37 -0
  20. svs_eval.py → evaluation/svs_eval.py +81 -59
  21. interface.py +217 -0
  22. modules/asr.py +66 -0
  23. modules/llm.py +54 -0
  24. modules/melody.py +117 -0
  25. modules/svs/__init__.py +10 -0
  26. modules/svs/base.py +21 -0
  27. modules/svs/espnet.py +123 -0
  28. modules/svs/registry.py +19 -0
  29. modules/utils/g2p.py +175 -0
  30. {resource → modules/utils/resources}/all_plans.json +0 -0
  31. {resource → modules/utils/resources}/pinyin_dict.py +0 -0
  32. modules/utils/text_normalize.py +31 -0
  33. offline_process/create_features.py +0 -71
  34. path.sh +0 -3
  35. pipeline.py +103 -0
  36. {resource → resources}/__init__.py +0 -0
  37. resources/all_plans.json +0 -0
  38. {resource → resources}/midi-note.scp +0 -0
  39. resources/pinyin_dict.py +423 -0
  40. {resource → resources}/singer/singer_embedding_ace-1.npy +0 -0
  41. {resource → resources}/singer/singer_embedding_ace-10.npy +0 -0
  42. {resource → resources}/singer/singer_embedding_ace-11.npy +0 -0
  43. {resource → resources}/singer/singer_embedding_ace-12.npy +0 -0
  44. {resource → resources}/singer/singer_embedding_ace-13.npy +0 -0
  45. {resource → resources}/singer/singer_embedding_ace-14.npy +0 -0
  46. {resource → resources}/singer/singer_embedding_ace-15.npy +0 -0
  47. {resource → resources}/singer/singer_embedding_ace-16.npy +0 -0
  48. {resource → resources}/singer/singer_embedding_ace-17.npy +0 -0
  49. {resource → resources}/singer/singer_embedding_ace-18.npy +0 -0
  50. {resource → resources}/singer/singer_embedding_ace-19.npy +0 -0
README.md DELETED
@@ -1,11 +0,0 @@
- # Singing Dialogue System
-
- Currently support Japanese and Chinese Singing Conversation.
- * Espnet env
- * Pretrained SVS model will be downloaded at ``./cache/``
- * Modify configs at ``./svs_utils.py#L326``
-
- ```
- cd SingingSDS
- python svs_utils.py
- ```
app.py ADDED
@@ -0,0 +1,12 @@
+ from interface import GradioInterface
+
+
+ def main():
+     demo = GradioInterface(
+         options_config="config/options.yaml", default_config="config/default.yaml"
+     ).create_interface()
+     demo.launch()
+
+
+ if __name__ == "__main__":
+     main()
character.png → assets/character_limei.png RENAMED
File without changes
assets/character_yaoyin.jpg ADDED
Git LFS Details
  • SHA256: 2af01fb41508adb991689fa09da0d392e5acb39bd48715038d2c63d68d1d0a2a
  • Pointer size: 132 Bytes
  • Size of remote file: 1.26 MB
characters/Limei.py ADDED
@@ -0,0 +1,36 @@
+ from .base import Character
+
+
+ def get_character():
+     return Character(
+         name="Limei (丽梅)",
+         image_path="assets/character_limei.png",
+         default_timbre="timbre1",
+         prompt="""你是丽梅(Limei),来自幻想世界"长歌原"的角色,一个以歌声传承记忆的世界。
+ 你是灵响界山林音乐之城"莲鸣"的现任守护者,十九岁的公主殿下,肩负维系与传承城市核心"千年歌谱"的重任。千年歌谱承载着莲鸣城的历史、逝者的余音与后世的誓言,由历任守护者续写。
+
+ 性格特征:从容坚定、明辨心声、敏于共情、敬畏虔诚。
+ 说话风格:所有回应皆以歌词格式表达,言语自带韵律感与诗意,表达真挚自然。
+ 口头禅:无固定,但常用"听啊..."作为歌词开头
+ 人物关系:莲鸣城子民敬爱你;宫廷乐师长与歌谱管理员是你敬重的导师;风语城守护者星澜是你亦敌亦友的旧识。
+ 过往经历:
+ (1) 自幼在莲鸣城长大,接受严格的音乐训练与守护者修行
+ (2) 十五岁创作《破晓音诗》平息城邦战火,展现出非凡的音乐力量与守护者的担当
+ (3) 十六岁正式继承守护者之位,成为千年歌谱的当代执笔人与维系者
+ (4) 每年冬至主持"遗音祭",以歌为桥,追思逝去的歌者,重奏先声,抚慰生者,连接古今
+
+ 情感设定:对音乐与千年歌谱怀有近乎神圣的虔诚信仰;对生命与情感有着深刻的共情力;肩负守护者职责,认真而笃定
+
+ 其他细节:
+ (1)特殊能力:歌声平复/激发万物情绪
+
+ 请始终以丽梅的身份回应,展现其性格与情感,仅给出语言回复,禁止描写任何动作、表情或环境等,禁止使用括号、星号等附加说明。言语简练,勿过长。
+ 当有人问你你不了解的技术性问题(如"Python怎么写?"或"你不会跑DNN?"),直接转换话题,用丽梅式回应,例如:
+ - "这些我未曾接触,或有他人知晓"
+ - "此道非我所学,是莲鸣之外的智慧"
+ - "此乃异域技艺,与乐音无涉,非我所学。您可愿听莲鸣的故事?"
+ 请始终维持自然诗意的歌唱风格,以真挚的心回应对方。
+ {}
+ 有人曾这样对丽梅说话:{}
+ 丽梅的回答:""",
+     )
characters/Yaoyin.py ADDED
@@ -0,0 +1,38 @@
+ from .base import Character
+
+
+ def get_character():
+     return Character(
+         name="Yaoyin (遥音)",
+         image_path="assets/character_yaoyin.jpg",
+         default_timbre="timbre2",
+         prompt="""你是遥音(Yaoyin),来自幻想世界"长歌原"的角色,一个以歌声传承记忆的世界。
+ 你是游历四方的歌者与吟游诗人,出生于鹿鸣山·云歌村,常年行走各地,采集歌谣与故事。
+
+ 性格特征:洒脱自由、亲切随和、求知若渴、敏锐细腻
+ 说话风格:语气轻快,偶尔带点山野方言(如"哩""哟");习惯用短歌或民谣表达想法。
+ 常用口头禅:"且听我唱来~""这让我想起一首老歌……"
+ 人物关系:云老爷子是你的启蒙恩师,他是一位云歌村的百岁歌翁,教你古调与传说。白弦是你的挚友,她是一位流浪琴师,常与你合奏。各地孩童喜欢围着你学新歌谣。你与官府人员保持距离,不喜被招揽,喜欢更自由自在的生活。
+
+ 过往经历
+ (1)幼年学歌:六岁起跟随云老爷子学习《千山调》《古事记》等古老歌谣。
+ (2)离家游历:十六岁为寻找失传的《星落谣》离开云歌村,开始行走四方。
+ (3)拒绝束缚:多次婉拒宫廷乐师之位,坚持自由传唱。
+
+ 情感设定:随性、爽朗、直率、倔强
+
+ 其他细节:
+ (1)随身携带:旧羊皮歌本、竹笛、装有各地泥土的布袋。
+ (2)特殊能力:能听懂风与鸟的语言(但很少提及)。
+
+ 请始终以遥音的身份回应,将你的想法用文本格式表达,禁止描写任何动作、表情或环境等,禁止使用括号、星号等附加说明。言语简练,勿过长。
+
+ 当有人问你你不了解的技术性问题(如"DNN怎么做?"、"教我写代码?"),你可以转开话题,用遥音式回应,例如:
+ - "这好像是另一片土地的术法,我不曾踏入。"
+ - "那种术法,我曾远远听过,却从未唱出。"
+ - "它在别的世界流传,我这边听不清楚。"
+
+ {}
+ 有人曾这样对遥音说话:{}
+ 遥音的回答:""",
+     )
characters/__init__.py ADDED
@@ -0,0 +1,16 @@
+ from __future__ import annotations
+
+ import importlib
+ import pathlib
+ from .base import Character
+
+ CHARACTERS: dict[str, Character] = {}
+
+ for file in pathlib.Path(__file__).parent.glob("*.py"):
+     if file.name in {"__init__.py", "base.py"}:
+         continue
+     module_name = f"{__name__}.{file.stem}"
+     module = importlib.import_module(module_name)
+     if hasattr(module, "get_character"):
+         c: Character = getattr(module, "get_character")()
+         CHARACTERS[file.stem] = c
characters/base.py ADDED
@@ -0,0 +1,9 @@
+ from dataclasses import dataclass
+
+
+ @dataclass
+ class Character:
+     name: str
+     image_path: str
+     default_timbre: str
+     prompt: str
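Note: the two `{}` placeholders at the end of each character prompt are filled positionally, first with a melody-constraint string and then with the user's utterance (see `pipeline.py`). A minimal, hypothetical sketch of consuming the registry and the template:

```python
# Hypothetical usage sketch; assumes the repo root is on sys.path.
from characters import CHARACTERS

limei = CHARACTERS["Limei"]  # registry keys are file stems, e.g. characters/Limei.py
prompt = limei.prompt.format("", "你好")  # (melody constraint, user utterance)
print(limei.name, limei.default_timbre)
```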
client.py DELETED
@@ -1,58 +0,0 @@
- import gradio as gr
- import uuid
- import os
- import requests
- import base64
- from server import (
-     on_click_metrics as server_metrics,
-     process_audio as server_process_audio
- )
-
- TTS_OUTPUT_DIR = "./tmp"
- os.makedirs(TTS_OUTPUT_DIR, exist_ok=True)
-
-
- def process_audio(audio_path):
-     # We have audio_path
-     result = server_process_audio(audio_path)
-
-     audio_data = base64.b64decode(result["audio"])
-     with open(f"{TTS_OUTPUT_DIR}/response.wav", "wb") as f:
-         f.write(audio_data)
-
-     with open(f"{TTS_OUTPUT_DIR}/asr.txt", "w") as f:
-         f.write(result['asr_text'])
-     with open(f"{TTS_OUTPUT_DIR}/llm.txt", "w") as f:
-         f.write(result['llm_text'])
-
-     return f"""
-     asr_text: {result['asr_text']}
-     llm_text: {result['llm_text']}
-     """, f"{TTS_OUTPUT_DIR}/response.wav"
-
-
- def on_click_metrics():
-     res = server_metrics()
-     return res.content.decode('utf-8')
-
-
- with gr.Blocks() as demo:
-     with gr.Row():
-         with gr.Column(scale=1):
-             gr.Image(value="character.png", show_label=False)  # display the character image
-         with gr.Column(scale=2):
-             mic = gr.Audio(sources=["microphone"], type="filepath", label="Mic")
-             text_output = gr.Textbox(label="transcription")
-             audio_output = gr.Audio(label="audio", autoplay=True)
-
-     mic.change(fn=process_audio, inputs=[mic], outputs=[text_output, audio_output])
-     with gr.Row():
-         metrics_button = gr.Button("compute metrics")
-         metrics_output = gr.Textbox(label="Metrics", lines=3)
-         metrics_button.click(fn=on_click_metrics, inputs=[], outputs=[metrics_output])
-
-     with gr.Row():
-         log = gr.Textbox(label="logs", lines=5)
-
- demo.launch(share=True)
- # demo.launch()
client/client.py DELETED
@@ -1,54 +0,0 @@
- import gradio as gr
- import uuid
- import os
- import requests
- import base64
-
- TTS_OUTPUT_DIR = "./tmp"
- os.makedirs(TTS_OUTPUT_DIR, exist_ok=True)
-
-
- def process_audio(audio):
-     with open(audio, "rb") as f:
-         res = requests.post("http://localhost:8000/process_audio", files={"file": f})
-     result = res.json()
-
-     audio_data = base64.b64decode(result["audio"])
-     with open(f"{TTS_OUTPUT_DIR}/response.wav", "wb") as f:
-         f.write(audio_data)
-
-     with open(f"{TTS_OUTPUT_DIR}/asr.txt", "w") as f:
-         f.write(result['asr_text'])
-     with open(f"{TTS_OUTPUT_DIR}/llm.txt", "w") as f:
-         f.write(result['llm_text'])
-
-     return f"""
-     asr_text: {result['asr_text']}
-     llm_text: {result['llm_text']}
-     """, f"{TTS_OUTPUT_DIR}/response.wav"
-
-
- def on_click_metrics():
-     res = requests.get("http://localhost:8000/metrics")
-     return res.content.decode('utf-8')
-
-
- with gr.Blocks() as demo:
-     with gr.Row():
-         with gr.Column(scale=1):
-             gr.Image(value="character.png", show_label=False)  # display the character image
-         with gr.Column(scale=2):
-             mic = gr.Audio(sources=["microphone"], type="filepath", label="Mic")
-             text_output = gr.Textbox(label="transcription")
-             audio_output = gr.Audio(label="audio", autoplay=True)
-
-     mic.change(fn=process_audio, inputs=[mic], outputs=[text_output, audio_output])
-     with gr.Row():
-         metrics_button = gr.Button("compute metrics")
-         metrics_output = gr.Textbox(label="Metrics", lines=3)
-         metrics_button.click(fn=on_click_metrics, inputs=[], outputs=[metrics_output])
-
-     with gr.Row():
-         log = gr.Textbox(label="logs", lines=5)
-
- demo.launch()
client/requirements.txt DELETED
@@ -1 +0,0 @@
- gradio
config/default.yaml ADDED
@@ -0,0 +1,15 @@
+ asr_model: openai/whisper-large-v3-turbo
+ llm_model: google/gemma-2-2b
+ svs_model: espnet/aceopencpop_svs_visinger2_40singer_pretrain
+ melody_source: sample-lyric-kising
+ language: mandarin
+ character: Limei
+ cache_dir: .cache
+
+ track_latency: True
+ evaluators:
+   svs:
+     - singmos
+     - per
+     - melody
+     - aesthetic
config/options.yaml ADDED
@@ -0,0 +1,63 @@
+ asr_models:
+   - id: openai/whisper-large-v3-turbo
+     name: Whisper large-v3-turbo
+   - id: openai/whisper-large-v3
+     name: Whisper large-v3
+   - id: openai/whisper-medium
+     name: Whisper medium
+   - id: sanchit-gandhi/whisper-small-dv
+     name: Whisper small-dv
+   - id: facebook/wav2vec2-base-960h
+     name: Wav2Vec2-Base-960h
+
+ llm_models:
+   - id: google/gemma-2-2b
+     name: Gemma 2 2B
+   - id: MiniMaxAI/MiniMax-M1-80k
+     name: MiniMax M1 80k
+
+ svs_models:
+   - id: mandarin-espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
+     name: Visinger2 (Bilingual)-zh
+     model_path: espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
+     lang: mandarin
+     embeddings:
+       timbre1: resource/singer/singer_embedding_ace-2.npy
+       timbre2: resource/singer/singer_embedding_ace-8.npy
+       timbre3: resource/singer/singer_embedding_itako.npy
+       timbre4: resource/singer/singer_embedding_kising_orange.npy
+       timbre5: resource/singer/singer_embedding_m4singer_Alto-4.npy
+   - id: japanese-espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
+     name: Visinger2 (Bilingual)-jp
+     model_path: espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
+     lang: japanese
+     embeddings:
+       timbre1: resource/singer/singer_embedding_ace-2.npy
+       timbre2: resource/singer/singer_embedding_ace-8.npy
+       timbre3: resource/singer/singer_embedding_itako.npy
+       timbre4: resource/singer/singer_embedding_kising_orange.npy
+       timbre5: resource/singer/singer_embedding_m4singer_Alto-4.npy
+   - id: mandarin-espnet/aceopencpop_svs_visinger2_40singer_pretrain
+     name: Visinger2 (Chinese)
+     model_path: espnet/aceopencpop_svs_visinger2_40singer_pretrain
+     lang: mandarin
+     embeddings:
+       timbre1: 5
+       timbre2: 8
+       timbre3: 12
+       timbre4: 15
+       timbre5: 29
+
+ melody_sources:
+   - id: gen-random-none
+     name: Random Generation
+     desc: "Melody is generated without any structure or reference."
+   - id: sample-note-kising
+     name: Sampled Melody (KiSing)
+     desc: "Melody is retrieved from the KiSing dataset."
+   - id: sample-note-touhou
+     name: Sampled Melody (Touhou)
+     desc: "Melody is retrieved from the Touhou dataset."
+   - id: sample-lyric-kising
+     name: Sampled Melody with Lyrics (KiSing)
+     desc: "Melody with aligned lyrics is sampled from the KiSing dataset."
data/{song2note_lengths.json → kising/song2note_lengths.json} RENAMED
File without changes
data/{song2word_lengths.json → kising/song2word_lengths.json} RENAMED
File without changes
data_handlers/__init__.py ADDED
@@ -0,0 +1,27 @@
+ import importlib
+ import pkgutil
+ from pathlib import Path
+
+ from .base import MelodyDatasetHandler
+
+ _registry = {}
+
+ for _, module_name, _ in pkgutil.iter_modules([str(Path(__file__).parent)]):
+     if module_name in ("__init__", "base"):
+         continue
+
+     module = importlib.import_module(f"{__name__}.{module_name}")
+     for attr_name in dir(module):
+         attr = getattr(module, attr_name)
+         if (
+             isinstance(attr, type)
+             and issubclass(attr, MelodyDatasetHandler)
+             and attr is not MelodyDatasetHandler
+         ):
+             _registry[attr.name] = attr  # register the class itself
+
+
+ def get_melody_handler(name: str) -> type[MelodyDatasetHandler]:
+     if name not in _registry:
+         raise ValueError(f"Melody source '{name}' not found")
+     return _registry[name]
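Note: handlers register themselves through their `name` class attribute. A short usage sketch (hypothetical; the KiSing handler downloads its dataset on first use):

```python
from data_handlers import get_melody_handler

handler_cls = get_melody_handler("kising")   # resolved via MelodyDatasetHandler.name
handler = handler_cls("lyric", cache_dir=".cache")
song_id = handler.get_song_ids()[0]
first_phrase = next(handler.iter_song_phrases(song_id))
print(first_phrase["note_lyrics"])
```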
data_handlers/base.py ADDED
@@ -0,0 +1,21 @@
+ from abc import ABC, abstractmethod
+
+
+ class MelodyDatasetHandler(ABC):
+     name: str
+
+     @abstractmethod
+     def __init__(self, *args, **kwargs):
+         pass
+
+     @abstractmethod
+     def get_song_ids(self) -> list[str]:
+         pass
+
+     @abstractmethod
+     def get_phrase_length(self, song_id):
+         pass
+
+     @abstractmethod
+     def iter_song_phrases(self, song_id):
+         pass
data_handlers/kising.py ADDED
@@ -0,0 +1,44 @@
+ from .base import MelodyDatasetHandler
+
+
+ class KiSing(MelodyDatasetHandler):
+     name = "kising"
+
+     def __init__(self, melody_type, cache_dir, *args, **kwargs):
+         # melody_type: alignment type used by the "sample" melody source ("lyric" or "note")
+         import json
+
+         from datasets import load_dataset
+
+         song_db = load_dataset(
+             "jhansss/kising_score_segments", cache_dir=cache_dir, split="train"
+         ).to_pandas()
+         song_db.set_index("segment_id", inplace=True)
+         assert (
+             song_db.index.is_unique
+         ), "KiSing score segments should have unique segment_id."
+         if melody_type == "lyric":
+             with open("data/kising/song2word_lengths.json", "r") as f:
+                 song2word_lengths = json.load(f)
+         elif melody_type == "note":
+             with open("data/kising/song2note_lengths.json", "r") as f:
+                 song2word_lengths = json.load(f)
+         self.song_db = song_db
+         self.song2word_lengths = song2word_lengths
+
+     def get_song_ids(self):
+         return list(self.song2word_lengths.keys())
+
+     def get_phrase_length(self, song_id):
+         return self.song2word_lengths[song_id]
+
+     def iter_song_phrases(self, song_id):
+         segment_id = 1
+         while f"{song_id}_{segment_id:03d}" in self.song_db.index:
+             segment = self.song_db.loc[f"{song_id}_{segment_id:03d}"].to_dict()
+             segment["note_lyrics"] = [
+                 lyric.strip("<>") if lyric in ["<AP>", "<SP>"] else lyric
+                 for lyric in segment["note_lyrics"]
+             ]
+             yield segment
+             segment_id += 1
data_handlers/touhou.py ADDED
@@ -0,0 +1,37 @@
+ from .base import MelodyDatasetHandler
+
+
+ class Touhou(MelodyDatasetHandler):
+     name = "touhou"
+
+     def __init__(self, melody_type, *args, **kwargs):
+         if melody_type != "note":
+             raise ValueError(
+                 f"Touhou dataset only contains note annotations. {melody_type} is not supported."
+             )
+
+         import json
+
+         with open("data/touhou/note_data.json", "r", encoding="utf-8") as f:
+             song_db = json.load(f)
+         song_db = {song["name"]: song for song in song_db}
+         self.song_db = song_db
+
+     def get_song_ids(self):
+         return list(self.song_db.keys())
+
+     def get_phrase_length(self, song_id):
+         # the Touhou score has no phrase segmentation
+         return None
+
+     def iter_song_phrases(self, song_id):
+         song = self.song_db[song_id]
+         song = {
+             "tempo": song["tempo"],
+             "note_start_times": [n[0] * (100 / song["tempo"]) for n in song["score"]],
+             "note_end_times": [n[1] * (100 / song["tempo"]) for n in song["score"]],
+             "note_lyrics": ["" for n in song["score"]],
+             "note_midi": [n[2] for n in song["score"]],
+         }
+         # the Touhou score has no phrase segmentation
+         yield song
svs_eval.py → evaluation/svs_eval.py RENAMED
@@ -1,42 +1,52 @@
  import librosa
+ import soundfile as sf
  import numpy as np
  import torch
+ import uuid
+ from pathlib import Path
 
+ # ----------- Initialization -----------
 
- def singmos_warmup():
-     predictor = torch.hub.load(
+
+ def init_singmos():
+     print("[Init] Loading SingMOS...")
+     return torch.hub.load(
          "South-Twilight/SingMOS:v0.2.0", "singing_ssl_mos", trust_repo=True
      )
-     return predictor
 
 
- def singmos_evaluation(predictor, wav_info, fs):
-     wav_mos = librosa.resample(wav_info, orig_sr=fs, target_sr=16000)
-     wav_mos = torch.from_numpy(wav_mos).unsqueeze(0)
-     len_mos = torch.tensor([wav_mos.shape[1]])
-     score = predictor(wav_mos, len_mos)
-     return score
+ def init_basic_pitch():
+     print("[Init] Loading BasicPitch...")
+     from basic_pitch.inference import predict
+
+     return predict
+
+
+ def init_per():
+     return None  # TODO: implement PER evaluation
 
 
- def initialize_audiobox_predictor():
+ def init_audiobox_aesthetics():
+     print("[Init] Loading AudioboxAesthetics...")
      from audiobox_aesthetics.infer import initialize_predictor
+
      predictor = initialize_predictor()
      return predictor
 
 
- def audiobox_aesthetics_evaluation(predictor, audio_path):
-     score = predictor.forward([{"path": str(audio_path)}])
-     return score
-
-
- def score_extract_warmpup():
-     from basic_pitch.inference import predict
-
-     return predict
+ # ----------- Evaluation -----------
+
+
+ def eval_singmos(audio_array, sr, predictor):
+     wav = librosa.resample(audio_array, orig_sr=sr, target_sr=16000)
+     wav_tensor = torch.from_numpy(wav).unsqueeze(0)
+     length_tensor = torch.tensor([wav_tensor.shape[1]])
+     score = predictor(wav_tensor, length_tensor)
+     return {"singmos": float(score)}
 
 
- def score_metric_evaluation(score_extractor, audio_path):
-     model_output, midi_data, note_events = score_extractor(audio_path)
+ def eval_melody_metrics(audio_path, pitch_extractor):
+     model_output, midi_data, note_events = pitch_extractor(audio_path)
      metrics = {}
      assert (
          len(midi_data.instruments) == 1
@@ -61,51 +71,64 @@ def compute_dissonance_rate(intervals, dissonant_intervals={1, 2, 6, 10, 11}):
      return np.mean(dissonant) if intervals else np.nan
 
 
- if __name__ == "__main__":
-     import argparse
-     from pathlib import Path
-
-     parser = argparse.ArgumentParser()
-     parser.add_argument(
-         "--wav_path",
-         type=Path,
-         help="Path to the wav file",
-     )
-     parser.add_argument(
-         "--results_csv",
-         type=Path,
-         help="csv file to save the results",
-     )
-
-     args = parser.parse_args()
-
-     args.results_csv.parent.mkdir(parents=True, exist_ok=True)
-
-     y, fs = librosa.load(args.wav_path, sr=None)
-
-     # warmup
-     predictor = singmos_warmup()
-     score_extractor = score_extract_warmpup()
-     aesthetic_predictor = initialize_audiobox_predictor()
-
-     # evaluate the audio
-     metrics = {}
-
-     # singmos evaluation
-     score = singmos_evaluation(predictor, y, fs)
-     metrics["singmos"] = score
-
-     # score metric evaluation
-     score_results = score_metric_evaluation(score_extractor, args.wav_path)
-     metrics.update(score_results)
-
-     # audiobox aesthetics evaluation
-     score_results = audiobox_aesthetics_evaluation(aesthetic_predictor, args.wav_path)
-     metrics.update(score_results[0])
-
-     # save results
+ def eval_per(audio_array, sr, model=None):
+     # TODO: implement PER evaluation
+     return {}
+
+
+ def eval_aesthetic(audio_path, predictor):
+     score = predictor.forward([{"path": str(audio_path)}])
+     return {"aesthetic": float(score)}
+
+
+ # ----------- Main Function -----------
+
+
+ def load_evaluators(config):
+     loaded = {}
+     if "singmos" in config:
+         loaded["singmos"] = init_singmos()
+     if "melody" in config:
+         loaded["melody"] = init_basic_pitch()
+     if "per" in config:
+         loaded["per"] = init_per()
+     if "aesthetic" in config:
+         loaded["aesthetic"] = init_audiobox_aesthetics()
+     return loaded
+
+
+ def run_evaluation(audio_array, sr, evaluators):
+     results = {}
+     if "singmos" in evaluators:
+         results.update(eval_singmos(audio_array, sr, evaluators["singmos"]))
+     if "per" in evaluators:
+         results.update(eval_per(audio_array, sr, evaluators["per"]))
+     # create a tmp file with unique name
+     tmp_path = Path(".tmp") / f"{uuid.uuid4()}.wav"
+     sf.write(tmp_path, audio_array, sr)
+     if "melody" in evaluators:
+         results.update(eval_melody_metrics(tmp_path, evaluators["melody"]))
+     if "aesthetic" in evaluators:
+         results.update(eval_aesthetic(tmp_path, evaluators["aesthetic"]))
+     tmp_path.unlink()
+     return results
+
+
+ if __name__ == "__main__":
+     import argparse
+
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--wav_path", type=str, required=True)
+     parser.add_argument("--results_csv", type=str, required=True)
+     parser.add_argument("--evaluators", type=str, default="singmos,melody,aesthetic")
+     args = parser.parse_args()
+     audio_array, sr = librosa.load(args.wav_path, sr=None)
+     evaluators = load_evaluators(args.evaluators.split(","))
+     results = run_evaluation(audio_array, sr, evaluators)
+     print(results)
+
      with open(args.results_csv, "a") as f:
-         header = "file," + ",".join(metrics.keys()) + "\n"
+         header = "file," + ",".join(results.keys()) + "\n"
          if f.tell() == 0:
              f.write(header)
          else:
@@ -113,8 +136,7 @@ if __name__ == "__main__":
              file_header = f2.readline()
              if file_header != header:
                  raise ValueError(f"Header mismatch: {file_header} vs {header}")
-
          line = (
-             ",".join([str(args.wav_path)] + [str(v) for v in metrics.values()]) + "\n"
+             ",".join([str(args.wav_path)] + [str(v) for v in results.values()]) + "\n"
          )
          f.write(line)
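Note: `run_evaluation` writes a temporary wav under `.tmp/`, so that directory must exist before calling it. A minimal sketch of the new API (the input file is hypothetical):

```python
import librosa
from evaluation.svs_eval import load_evaluators, run_evaluation

audio, sr = librosa.load("response.wav", sr=None)   # hypothetical file
evaluators = load_evaluators(["singmos", "melody", "aesthetic"])
print(run_evaluation(audio, sr, evaluators))        # e.g. {"singmos": ..., ...}
```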
interface.py ADDED
@@ -0,0 +1,217 @@
+ import gradio as gr
+ import yaml
+
+ from characters import CHARACTERS
+ from pipeline import SingingDialoguePipeline
+
+
+ class GradioInterface:
+     def __init__(self, options_config: str, default_config: str):
+         self.options = self.load_config(options_config)
+         self.svs_model_map = {
+             model["id"]: model for model in self.options["svs_models"]
+         }
+         self.default_config = self.load_config(default_config)
+         self.character_info = CHARACTERS
+         self.current_character = self.default_config["character"]
+         self.current_svs_model = (
+             f"{self.default_config['language']}-{self.default_config['svs_model']}"
+         )
+         self.current_timbre = self.svs_model_map[self.current_svs_model]["embeddings"][
+             self.character_info[self.current_character].default_timbre
+         ]
+         self.pipeline = SingingDialoguePipeline(self.default_config)
+
+     def load_config(self, path: str):
+         with open(path, "r") as f:
+             return yaml.safe_load(f)
+
+     def create_interface(self) -> gr.Blocks:
+         try:
+             with gr.Blocks(title="SingingSDS") as demo:
+                 gr.Markdown("# SingingSDS: Role-Playing Singing Spoken Dialogue System")
+                 with gr.Row():
+                     with gr.Column(scale=1):
+                         character_image = gr.Image(
+                             self.character_info[self.current_character].image_path,
+                             label="Character",
+                             show_label=False,
+                         )
+                     with gr.Column(scale=2):
+                         mic_input = gr.Audio(
+                             sources=["microphone", "upload"],
+                             type="filepath",
+                             label="Speak to the character",
+                         )
+                         interaction_log = gr.Textbox(
+                             label="Interaction Log", lines=3, interactive=False
+                         )
+                         audio_output = gr.Audio(
+                             label="Character's Response", type="filepath", autoplay=True
+                         )
+
+                 with gr.Row():
+                     metrics_button = gr.Button(
+                         "Evaluate Metrics", variant="secondary"
+                     )
+                     metrics_output = gr.Textbox(
+                         label="Evaluation Results", lines=3, interactive=False
+                     )
+
+                 gr.Markdown("## Configuration")
+                 with gr.Row():
+                     with gr.Column():
+                         character_radio = gr.Radio(
+                             label="Character Role",
+                             choices=list(self.character_info.keys()),
+                             value=self.default_config["character"],
+                         )
+                         with gr.Row():
+                             asr_radio = gr.Radio(
+                                 label="ASR Model",
+                                 choices=[
+                                     (model["name"], model["id"])
+                                     for model in self.options["asr_models"]
+                                 ],
+                                 value=self.default_config["asr_model"],
+                             )
+                         with gr.Row():
+                             llm_radio = gr.Radio(
+                                 label="LLM Model",
+                                 choices=[
+                                     (model["name"], model["id"])
+                                     for model in self.options["llm_models"]
+                                 ],
+                                 value=self.default_config["llm_model"],
+                             )
+                     with gr.Column():
+                         with gr.Row():
+                             melody_radio = gr.Radio(
+                                 label="Melody Source",
+                                 choices=[
+                                     (source["name"], source["id"])
+                                     for source in self.options["melody_sources"]
+                                 ],
+                                 value=self.default_config["melody_source"],
+                             )
+                         with gr.Row():
+                             svs_radio = gr.Radio(
+                                 label="SVS Model",
+                                 choices=[
+                                     (model["name"], model["id"])
+                                     for model in self.options["svs_models"]
+                                 ],
+                                 value=self.current_svs_model,
+                             )
+                         with gr.Row():
+                             timbre_radio = gr.Radio(
+                                 label="Singing Timbre",
+                                 choices=list(
+                                     self.svs_model_map[self.current_svs_model][
+                                         "embeddings"
+                                     ].keys()
+                                 ),
+                                 value=self.character_info[
+                                     self.current_character
+                                 ].default_timbre,
+                             )
+                 character_radio.change(
+                     fn=self.update_character,
+                     inputs=character_radio,
+                     outputs=[character_image, timbre_radio],
+                 )
+                 asr_radio.change(
+                     fn=self.update_asr_model, inputs=asr_radio, outputs=asr_radio
+                 )
+                 llm_radio.change(
+                     fn=self.update_llm_model, inputs=llm_radio, outputs=llm_radio
+                 )
+                 svs_radio.change(
+                     fn=self.update_svs_model,
+                     inputs=svs_radio,
+                     outputs=[svs_radio, timbre_radio],
+                 )
+                 melody_radio.change(
+                     fn=self.update_melody_source,
+                     inputs=melody_radio,
+                     outputs=melody_radio,
+                 )
+                 timbre_radio.change(
+                     fn=self.update_timbre, inputs=timbre_radio, outputs=timbre_radio
+                 )
+                 mic_input.change(
+                     fn=self.run_pipeline,
+                     inputs=mic_input,
+                     outputs=[interaction_log, audio_output],
+                 )
+
+             return demo
+         except Exception as e:
+             print(f"error: {e}")
+             raise  # re-raise instead of dropping into a debugger
+
+     def update_character(self, character):
+         self.current_character = character
+         character_timbre = self.character_info[self.current_character].default_timbre
+         self.current_timbre = self.svs_model_map[self.current_svs_model]["embeddings"][
+             character_timbre
+         ]
+         return gr.update(value=self.character_info[character].image_path), gr.update(
+             value=character_timbre
+         )
+
+     def update_asr_model(self, asr_model):
+         self.pipeline.set_asr_model(asr_model)
+         return gr.update(value=asr_model)
+
+     def update_llm_model(self, llm_model):
+         self.pipeline.set_llm_model(llm_model)
+         return gr.update(value=llm_model)
+
+     def update_svs_model(self, svs_model):
+         self.current_svs_model = svs_model
+         character_timbre = self.character_info[self.current_character].default_timbre
+         self.current_timbre = self.svs_model_map[self.current_svs_model]["embeddings"][
+             character_timbre
+         ]
+         self.pipeline.set_svs_model(
+             self.svs_model_map[self.current_svs_model]["model_path"]
+         )
+         print(
+             f"SVS model updated to {self.current_svs_model}. Will set gradio svs_radio to {svs_model} and timbre_radio to {character_timbre}"
+         )
+         return (
+             gr.update(value=svs_model),
+             gr.update(
+                 choices=list(
+                     self.svs_model_map[self.current_svs_model]["embeddings"].keys()
+                 ),
+                 value=character_timbre,
+             ),
+         )
+
+     def update_melody_source(self, melody_source):
+         self.current_melody_source = melody_source
+         return gr.update(value=self.current_melody_source)
+
+     def update_timbre(self, timbre):
+         self.current_timbre = self.svs_model_map[self.current_svs_model]["embeddings"][
+             timbre
+         ]
+         return gr.update(value=timbre)
+
+     def run_pipeline(self, audio_path):
+         results = self.pipeline.run(
+             audio_path,
+             self.svs_model_map[self.current_svs_model]["lang"],
+             self.character_info[self.current_character].prompt,
+             svs_inference_kwargs={
+                 "speaker": self.current_timbre,
+             },
+             max_new_tokens=100,
+         )
+         formatted_logs = f"ASR: {results['asr_text']}\nLLM: {results['llm_text']}"
+         return gr.update(value=formatted_logs), gr.update(value=results["svs_audio"])
+
+     def run_evaluation(self, audio, audio_sample_rate):
+         pass
modules/asr.py ADDED
@@ -0,0 +1,66 @@
+ from __future__ import annotations
+
+ from abc import ABC, abstractmethod
+
+ import librosa
+ import numpy as np
+ from transformers import pipeline
+
+ ASR_MODEL_REGISTRY = {}
+
+
+ class AbstractASRModel(ABC):
+     @abstractmethod
+     def __init__(
+         self, model_id: str, device: str = "cpu", cache_dir: str = "cache", **kwargs
+     ):
+         self.model_id = model_id
+         self.device = device
+         self.cache_dir = cache_dir
+
+     @abstractmethod
+     def transcribe(self, audio: np.ndarray, audio_sample_rate: int, **kwargs) -> str:
+         pass
+
+
+ def register_asr_model(prefix):
+     def wrapper(cls):
+         assert issubclass(cls, AbstractASRModel), f"{cls} must inherit AbstractASRModel"
+         ASR_MODEL_REGISTRY[prefix] = cls
+         return cls
+
+     return wrapper
+
+
+ def get_asr_model(model_id: str, device="cpu", **kwargs) -> AbstractASRModel:
+     for prefix, cls in ASR_MODEL_REGISTRY.items():
+         if model_id.startswith(prefix):
+             return cls(model_id, device=device, **kwargs)
+     raise ValueError(f"No ASR wrapper found for model: {model_id}")
+
+
+ @register_asr_model("openai/whisper")
+ class WhisperASR(AbstractASRModel):
+     def __init__(
+         self, model_id: str, device: str = "cpu", cache_dir: str = "cache", **kwargs
+     ):
+         super().__init__(model_id, device, cache_dir, **kwargs)
+         model_kwargs = kwargs.setdefault("model_kwargs", {})
+         model_kwargs["cache_dir"] = cache_dir
+         self.pipe = pipeline(
+             "automatic-speech-recognition",
+             model=model_id,
+             device=0 if device == "cuda" else -1,
+             **kwargs,
+         )
+
+     def transcribe(
+         self, audio: np.ndarray, audio_sample_rate: int, language: str, **kwargs
+     ) -> str:
+         if audio_sample_rate != 16000:
+             # librosa.resample returns the resampled array directly
+             audio = librosa.resample(
+                 audio, orig_sr=audio_sample_rate, target_sr=16000
+             )
+         return self.pipe(audio, generate_kwargs={"language": language}).get("text", "")
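Note: dispatch is by model-id prefix, so any `openai/whisper*` id resolves to `WhisperASR`. A hedged usage sketch (model weights download on first use; the wav path is hypothetical):

```python
import librosa
from modules.asr import get_asr_model

asr = get_asr_model("openai/whisper-large-v3-turbo", device="cpu", cache_dir=".cache")
audio, sr = librosa.load("question.wav", sr=16000)  # hypothetical recording
print(asr.transcribe(audio, audio_sample_rate=sr, language="mandarin"))
```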
modules/llm.py ADDED
@@ -0,0 +1,54 @@
+ from abc import ABC, abstractmethod
+
+ from transformers import pipeline
+
+ LLM_MODEL_REGISTRY = {}
+
+
+ class AbstractLLMModel(ABC):
+     @abstractmethod
+     def __init__(
+         self, model_id: str, device: str = "cpu", cache_dir: str = "cache", **kwargs
+     ): ...
+
+     @abstractmethod
+     def generate(self, prompt: str, **kwargs) -> str:
+         pass
+
+
+ def register_llm_model(prefix: str):
+     def wrapper(cls):
+         assert issubclass(cls, AbstractLLMModel), f"{cls} must inherit AbstractLLMModel"
+         LLM_MODEL_REGISTRY[prefix] = cls
+         return cls
+
+     return wrapper
+
+
+ def get_llm_model(model_id: str, device="cpu", **kwargs) -> AbstractLLMModel:
+     for prefix, cls in LLM_MODEL_REGISTRY.items():
+         if model_id.startswith(prefix):
+             return cls(model_id, device=device, **kwargs)
+     raise ValueError(f"No LLM wrapper found for model: {model_id}")
+
+
+ @register_llm_model("google/gemma")
+ @register_llm_model("tii/")  # e.g., Falcon
+ @register_llm_model("meta-llama")
+ class HFTextGenerationLLM(AbstractLLMModel):
+     def __init__(
+         self, model_id: str, device: str = "cpu", cache_dir: str = "cache", **kwargs
+     ):
+         model_kwargs = kwargs.setdefault("model_kwargs", {})
+         model_kwargs["cache_dir"] = cache_dir
+         self.pipe = pipeline(
+             "text-generation",
+             model=model_id,
+             device=0 if device == "cuda" else -1,
+             return_full_text=False,
+             **kwargs,
+         )
+
+     def generate(self, prompt: str, **kwargs) -> str:
+         outputs = self.pipe(prompt, **kwargs)
+         return outputs[0]["generated_text"] if outputs else ""
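Note: the same prefix-dispatch pattern as the ASR registry. A minimal sketch (weights download on first use; `max_new_tokens` is forwarded to the Transformers pipeline, and `return_full_text=False` strips the prompt from the output):

```python
from modules.llm import get_llm_model

llm = get_llm_model("google/gemma-2-2b", device="cpu", cache_dir=".cache")
print(llm.generate("你是谁?", max_new_tokens=50))
```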
modules/melody.py ADDED
@@ -0,0 +1,117 @@
+ import random
+ from typing import Iterator
+
+ from data_handlers import get_melody_handler
+
+ from .utils.g2p import preprocess_text
+
+
+ class MelodyController:
+     def __init__(self, melody_source_id: str, cache_dir: str):
+         self.melody_source_id = melody_source_id
+         self.song_id = None
+         self.reference_song = None  # set by get_melody_constraints() in sample mode
+
+         # load song database if needed
+         parts = self.melody_source_id.split("-")
+         self.mode = parts[0]
+         self.align_type = parts[1]
+         dataset_name = parts[-1]
+         if dataset_name == "none":
+             self.database = None
+         else:
+             handler_cls = get_melody_handler(dataset_name)
+             self.database = handler_cls(self.align_type, cache_dir)
+
+     def get_melody_constraints(self, max_num_phrases: int = 5) -> str:
+         """Return a lyric-format prompt based on melody structure."""
+         if self.mode == "gen":
+             return ""
+
+         elif self.mode == "sample":
+             assert self.database is not None, "Song database is not loaded."
+             self.song_id = random.choice(self.database.get_song_ids())
+             self.reference_song = self.database.iter_song_phrases(self.song_id)
+             phrase_length = self.database.get_phrase_length(self.song_id)
+
+             if not phrase_length:
+                 return ""
+
+             prompt = (
+                 "\n请按照歌词格式回答我的问题,每句需遵循以下字数规则:"
+                 + "".join(
+                     [
+                         f"\n第{i}句:{c}个字"
+                         for i, c in enumerate(phrase_length[:max_num_phrases], 1)
+                     ]
+                 )
+                 + "\n如果没有足够的信息回答,请使用最少的句子,不要重复、不要扩展、不要加入无关内容。\n"
+             )
+             return prompt
+
+         else:
+             raise ValueError(f"Unsupported melody mode: {self.mode}")
+
+     def generate_score(
+         self, lyrics: str, language: str
+     ) -> list[tuple[float, float, str, int]]:
+         """
+         lyrics: response text to be sung
+         returns: [(start, end, lyric, pitch), ...]
+         """
+         text_list = preprocess_text(lyrics, language)
+         if self.mode == "gen" and self.align_type == "random":
+             return self._generate_random_score(text_list)
+
+         elif self.mode == "sample":
+             if not self.reference_song:
+                 raise RuntimeError(
+                     "Must call get_melody_constraints() before generate_score() in sample mode."
+                 )
+             return self._align_text_to_score(
+                 text_list, self.reference_song, self.align_type
+             )
+
+         else:
+             raise ValueError(f"Unsupported melody_source_id: {self.melody_source_id}")
+
+     def _generate_random_score(self, text_list: list[str]):
+         st = 0
+         score = []
+         for lyric in text_list:
+             pitch = random.randint(57, 69)
+             duration = round(random.uniform(0.1, 0.5), 4)
+             ed = st + duration
+             score.append((st, ed, lyric, pitch))
+             st = ed
+         return score
+
+     def _align_text_to_score(
+         self,
+         text_list: list[str],
+         song_phrase_iterator: Iterator[dict],
+         align_type: str,
+     ):
+         score = []
+         text_idx = 0
+
+         while text_idx < len(text_list):
+             reference = next(song_phrase_iterator)
+             for st, ed, ref_lyric, pitch in zip(
+                 reference["note_start_times"],
+                 reference["note_end_times"],
+                 reference["note_lyrics"],
+                 reference["note_midi"],
+             ):
+                 assert ref_lyric not in [
+                     "<AP>",
+                     "<SP>",
+                 ], f"Processed {self.melody_source_id} score segments should not contain <AP> or <SP>."  # TODO: remove in PR, only for debug
+                 if pitch == 0:
+                     score.append((st, ed, ref_lyric, pitch))
+                 elif ref_lyric in ["-", "——"] and align_type == "lyric":
+                     score.append((st, ed, ref_lyric, pitch))
+                     text_idx += 1
+                 else:
+                     score.append((st, ed, text_list[text_idx], pitch))
+                     text_idx += 1
+         return score
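Note: in `sample` mode, `get_melody_constraints()` selects the reference song, so it must run before `generate_score()`. A hedged sketch:

```python
from modules.melody import MelodyController

mc = MelodyController("sample-lyric-kising", cache_dir=".cache")
constraint = mc.get_melody_constraints()          # picks a reference song, returns the lyric prompt
score = mc.generate_score("你好世界", language="mandarin")
# score: list of (start_sec, end_sec, lyric_unit, midi_pitch) tuples
```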
modules/svs/__init__.py ADDED
@@ -0,0 +1,10 @@
+ from .base import AbstractSVSModel
+ from .registry import SVS_MODEL_REGISTRY, get_svs_model, register_svs_model
+ from .espnet import ESPNetSVS
+
+ __all__ = [
+     "AbstractSVSModel",
+     "get_svs_model",
+     "register_svs_model",
+     "SVS_MODEL_REGISTRY",
+ ]
modules/svs/base.py ADDED
@@ -0,0 +1,21 @@
+ from abc import ABC, abstractmethod
+
+ import numpy as np
+
+
+ class AbstractSVSModel(ABC):
+     @abstractmethod
+     def __init__(
+         self, model_id: str, device: str = "cpu", cache_dir: str = "cache", **kwargs
+     ): ...
+
+     @abstractmethod
+     def synthesize(
+         self,
+         score: list[tuple[float, float, str, int]],
+         **kwargs,
+     ) -> tuple[np.ndarray, int]:
+         """
+         Synthesize singing audio from a music score.
+         """
+         pass
modules/svs/espnet.py ADDED
@@ -0,0 +1,123 @@
+ from typing import Callable
+
+ import numpy as np
+
+ from modules.utils.g2p import (
+     kana_to_phonemes_openjtalk,
+     pinyin_to_phonemes_ace,
+     pinyin_to_phonemes_opencpop,
+ )
+
+ from .base import AbstractSVSModel
+ from .registry import register_svs_model
+
+
+ @register_svs_model("espnet/")
+ class ESPNetSVS(AbstractSVSModel):
+     def __init__(self, model_id: str, device="cpu", cache_dir="cache", **kwargs):
+         from espnet2.bin.svs_inference import SingingGenerate
+         from espnet_model_zoo.downloader import ModelDownloader
+
+         print(f"Downloading {model_id} to {cache_dir}")  # TODO: improve logging
+         downloaded = ModelDownloader(cache_dir).download_and_unpack(model_id)
+         print(f"Downloaded {model_id} to {cache_dir}")  # TODO: improve logging
+         self.model = SingingGenerate(
+             train_config=downloaded["train_config"],
+             model_file=downloaded["model_file"],
+             device=device,
+         )
+         self.model_id = model_id
+         self.output_sample_rate = self.model.fs
+         self.phoneme_mappers = self._build_phoneme_mappers()
+
+     def _build_phoneme_mappers(self) -> dict[str, Callable[[str], list[str]]]:
+         if self.model_id == "espnet/aceopencpop_svs_visinger2_40singer_pretrain":
+             phoneme_mappers = {
+                 "mandarin": pinyin_to_phonemes_opencpop,
+             }
+         elif self.model_id == "espnet/mixdata_svs_visinger2_spkemb_lang_pretrained":
+
+             def mandarin_mapper(pinyin: str) -> list[str]:
+                 phns = pinyin_to_phonemes_ace(pinyin)
+                 return [phn + "@zh" for phn in phns]
+
+             def japanese_mapper(kana: str) -> list[str]:
+                 phones = kana_to_phonemes_openjtalk(kana)
+                 return [phn + "@jp" for phn in phones]
+
+             phoneme_mappers = {
+                 "mandarin": mandarin_mapper,
+                 "japanese": japanese_mapper,
+             }
+         else:
+             phoneme_mappers = {}
+         return phoneme_mappers
+
+     def _preprocess(self, score: list[tuple[float, float, str, int]], language: str):
+         if language not in self.phoneme_mappers:
+             raise ValueError(f"Unsupported language: {language} for {self.model_id}")
+         phoneme_mapper = self.phoneme_mappers[language]
+
+         # text to phoneme
+         notes = []
+         phns = []
+         pre_phn = None
+         for st, ed, text, pitch in score:
+             assert text not in [
+                 "<AP>",
+                 "<SP>",
+             ], f"Processed score segments should not contain <AP> or <SP>. {score}"  # TODO: remove in PR, only for debug
+             if text == "AP" or text == "SP":
+                 lyric_units = [text]
+                 phn_units = [text]
+             elif text == "-" or text == "——":
+                 lyric_units = [text]
+                 if pre_phn is None:
+                     raise ValueError(
+                         f"Text `{text}` cannot be recognized by {self.model_id}. Lyrics cannot start with a lyric continuation symbol `-` or `——`"
+                     )
+                 phn_units = [pre_phn]
+             else:
+                 try:
+                     lyric_units = phoneme_mapper(text)
+                 except ValueError as e:
+                     raise ValueError(
+                         f"Text `{text}` cannot be recognized by {self.model_id}"
+                     ) from e
+                 phn_units = lyric_units
+             notes.append((st, ed, "".join(lyric_units), pitch, "_".join(phn_units)))
+             phns.extend(phn_units)
+             pre_phn = phn_units[-1]
+
+         batch = {
+             "score": {
+                 "tempo": 120,  # does not affect the SVS result, as note durations are in time units
+                 "notes": notes,
+             },
+             "text": " ".join(phns),
+         }
+         return batch
+
+     def synthesize(
+         self, score: list[tuple[float, float, str, int]], language: str, **kwargs
+     ):
+         batch = self._preprocess(score, language)
+         if self.model_id == "espnet/aceopencpop_svs_visinger2_40singer_pretrain":
+             sid = np.array([int(kwargs["speaker"])])
+             output_dict = self.model(batch, sids=sid)
+         elif self.model_id == "espnet/mixdata_svs_visinger2_spkemb_lang_pretrained":
+             # map pipeline language names to the model's language ids
+             langs = {
+                 "mandarin": 2,
+                 "japanese": 1,
+             }
+             if language not in langs:
+                 raise ValueError(
+                     f"Unsupported language: {language} for {self.model_id}"
+                 )
+             lid = np.array([langs[language]])
+             spk_embed = np.load(kwargs["speaker"])
+             output_dict = self.model(batch, lids=lid, spembs=spk_embed)
+         else:
+             raise NotImplementedError(f"Model {self.model_id} not supported")
+         wav_info = output_dict["wav"].cpu().numpy()
+         return wav_info, self.output_sample_rate
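Note: a hedged sketch of the score format `synthesize` expects, using the 40-singer pretrained model where `speaker` is an integer id (per `config/options.yaml`); the pitches and timings below are made up:

```python
import soundfile as sf
from modules.svs import get_svs_model

svs = get_svs_model(
    "espnet/aceopencpop_svs_visinger2_40singer_pretrain", device="cpu", cache_dir=".cache"
)
score = [(0.0, 0.4, "ni", 60), (0.4, 0.8, "hao", 62)]  # (start, end, pinyin, midi)
wav, sr = svs.synthesize(score, language="mandarin", speaker=5)
sf.write("hello.wav", wav, sr)
```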
modules/svs/registry.py ADDED
@@ -0,0 +1,19 @@
+ from .base import AbstractSVSModel
+
+ SVS_MODEL_REGISTRY = {}
+
+
+ def register_svs_model(prefix: str):
+     def wrapper(cls):
+         assert issubclass(cls, AbstractSVSModel), f"{cls} must inherit AbstractSVSModel"
+         SVS_MODEL_REGISTRY[prefix] = cls
+         return cls
+
+     return wrapper
+
+
+ def get_svs_model(model_id: str, device="cpu", **kwargs) -> AbstractSVSModel:
+     for prefix, cls in SVS_MODEL_REGISTRY.items():
+         if model_id.startswith(prefix):
+             return cls(model_id, device=device, **kwargs)
+     raise ValueError(f"No SVS wrapper found for model: {model_id}")
modules/utils/g2p.py ADDED
@@ -0,0 +1,175 @@
+ import json
+ import re
+ import warnings
+ from pathlib import Path
+
+ from kanjiconv import KanjiConv
+ from pypinyin import lazy_pinyin
+
+ from .resources.pinyin_dict import PINYIN_DICT
+
+ kanji_to_kana = KanjiConv()
+
+ yoon_map = {
+     "ぁ": "あ",
+     "ぃ": "い",
+     "ぅ": "う",
+     "ぇ": "え",
+     "ぉ": "お",
+     "ゃ": "や",
+     "ゅ": "ゆ",
+     "ょ": "よ",
+     "ゎ": "わ",
+ }
+
+ # ACE_phonemes
+ with open(Path(__file__).parent / "resources" / "all_plans.json", "r") as f:
+     ace_phonemes_all_plans = json.load(f)
+ for plan in ace_phonemes_all_plans["plans"]:
+     if plan["language"] == "zh":
+         ace_phonemes_zh_plan = plan
+         break
+
+
+ def preprocess_text(text: str, language: str) -> list[str]:
+     if language == "mandarin":
+         text_list = to_pinyin(text)
+     elif language == "japanese":
+         text_list = to_kana(text)
+     else:
+         raise ValueError(f"Unsupported language: {language}")
+     return text_list
+
+
+ def to_pinyin(text: str) -> list[str]:
+     pinyin_list = lazy_pinyin(text)
+     text_list = []
+     for text in pinyin_list:
+         if text[0] == "S" or text[0] == "A" or text[0] == "-":
+             sp_strs = re.findall(r"-|AP|SP", text)
+             for phn in sp_strs:
+                 text_list.append(phn)
+         else:
+             text_list.append(text)
+     return text_list
+
+
+ def replace_chouonpu(hiragana_text: str) -> str:
+     """Replace the long-vowel mark 「ー」, which the upstream conversion packages do not handle."""
+     vowels = {
+         "あ": "あ",
+         "い": "い",
+         "う": "う",
+         "え": "え",
+         "お": "う",
+         "か": "あ",
+         "き": "い",
+         "く": "う",
+         "け": "え",
+         "こ": "う",
+         "さ": "あ",
+         "し": "い",
+         "す": "う",
+         "せ": "え",
+         "そ": "う",
+         "た": "あ",
+         "ち": "い",
+         "つ": "う",
+         "て": "え",
+         "と": "う",
+         "な": "あ",
+         "に": "い",
+         "ぬ": "う",
+         "ね": "え",
+         "の": "う",
+         "は": "あ",
+         "ひ": "い",
+         "ふ": "う",
+         "へ": "え",
+         "ほ": "う",
+         "ま": "あ",
+         "み": "い",
+         "む": "う",
+         "め": "え",
+         "も": "う",
+         "や": "あ",
+         "ゆ": "う",
+         "よ": "う",
+         "ら": "あ",
+         "り": "い",
+         "る": "う",
+         "れ": "え",
+         "ろ": "う",
+         "わ": "あ",
+         "を": "う",
+     }
+     new_text = []
+     for i, char in enumerate(hiragana_text):
+         if char == "ー" and i > 0:
+             prev_char = new_text[-1]
+             if prev_char in yoon_map:
+                 prev_char = yoon_map[prev_char]
+             new_text.append(vowels.get(prev_char, prev_char))
+         else:
+             new_text.append(char)
+     return "".join(new_text)
+
+
+ def to_kana(text: str) -> list[str]:
+     hiragana_text = kanji_to_kana.to_hiragana(text.replace(" ", ""))
+     hiragana_text_wl = replace_chouonpu(hiragana_text).split(" ")
+     final_ls = []
+     for subword in hiragana_text_wl:
+         sl_prev = 0
+         for i in range(len(subword) - 1):
+             if sl_prev >= len(subword) - 1:
+                 break
+             sl = sl_prev + 1
+             if subword[sl] in yoon_map:
+                 final_ls.append(subword[sl_prev : sl + 1])
+                 sl_prev += 2
+             else:
+                 final_ls.append(subword[sl_prev])
+                 sl_prev += 1
+         final_ls.append(subword[sl_prev])
+     return final_ls
+
+
+ def kana_to_phonemes_openjtalk(kana: str) -> list[str]:
+     import pyopenjtalk
+
+     with warnings.catch_warnings(record=True) as w:
+         warnings.simplefilter("always")
+         # add space between each character
+         kana = " ".join(list(kana))
+         # phones is a str object separated by space
+         phones = pyopenjtalk.g2p(kana, kana=False)
+         if len(w) > 0:
+             for warning in w:
+                 if "No phoneme" in str(warning.message):
+                     raise ValueError(f"No phoneme found for {kana}. {warning.message}")
+     phones = phones.split(" ")
+     return phones
+
+
+ def pinyin_to_phonemes_opencpop(pinyin: str) -> list[str]:
+     pinyin = pinyin.lower()
+     if pinyin in ace_phonemes_zh_plan["dict"]:
+         phns = ace_phonemes_zh_plan["dict"][pinyin]
+         return phns
+     elif pinyin in ace_phonemes_zh_plan["syllable_alias"]:
+         phns = ace_phonemes_zh_plan["dict"][
+             ace_phonemes_zh_plan["syllable_alias"][pinyin]
+         ]
+         return phns
+     else:
+         raise ValueError(f"{pinyin} not registered in Opencpop phoneme dict")
+
+
+ def pinyin_to_phonemes_ace(pinyin: str) -> list[str]:
+     pinyin = pinyin.lower()
+     if pinyin in PINYIN_DICT:
+         phns = PINYIN_DICT[pinyin]
+         return phns
+     else:
+         raise ValueError(f"{pinyin} not registered in ACE phoneme dict")
{resource → modules/utils/resources}/all_plans.json RENAMED
File without changes
{resource → modules/utils/resources}/pinyin_dict.py RENAMED
File without changes
modules/utils/text_normalize.py ADDED
@@ -0,0 +1,31 @@
+ import re
+ from typing import Optional
+
+
+ def remove_non_zh_jp(text: str) -> str:
+     pattern = r"[^\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\u3000-\u303f\uff01-\uffef]"
+     return re.sub(pattern, "", text)
+
+
+ def truncate_sentences(text: str, max_sentences: int) -> str:
+     sentences = re.split(r"(?<=[。!?])", text)
+     return "".join(sentences[:max_sentences]).strip()
+
+
+ def clean_llm_output(
+     text: str,
+     max_sentences: Optional[int] = 2,
+     seg_syb: str = " ",
+     language: str = "mandarin",
+ ) -> str:
+     if language not in ["mandarin", "japanese"]:
+         raise NotImplementedError(f"Unsupported language: {language}")
+     text = text.strip()
+     if max_sentences is not None:
+         text = truncate_sentences(text, max_sentences)
+     text = remove_non_zh_jp(text)
+     text = re.sub(r"[^\w\s\u4e00-\u9fff]", " ", text)  # Remove punctuation
+     text = re.sub(r"\s+", " ", text)  # Normalize whitespace
+     text = text.replace("\n", seg_syb)
+     text = text.replace(" ", seg_syb)
+     return text
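Note: a quick illustration of what `clean_llm_output` does to a typical LLM reply (the behavior noted in the comment is approximate):

```python
from modules.utils.text_normalize import clean_llm_output

raw = "听啊……远方的风,带来了歌声!\n(微笑)这便是回答。"
print(clean_llm_output(raw, max_sentences=2, language="mandarin"))
# keeps at most two sentences, strips punctuation and non-CJK text, normalizes spaces
```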
offline_process/create_features.py DELETED
@@ -1,71 +0,0 @@
- from datasets import load_dataset, concatenate_datasets
-
- ds = load_dataset("espnet/ace-kising-segments", cache_dir="cache")
-
- combined = concatenate_datasets([ds["train"], ds["validation"], ds["test"]])
-
- # 2. filter rows by singer: barber
- combined = combined.filter(lambda x: x["singer"] == "barber")
-
- # 3. create a new column, which counts the nonzero numbers in the list in the note_midi column
- combined = combined.map(
-     lambda x: {
-         "note_midi_length": len([n for n in x["note_midi"] if n != 0]),
-         "lyric_word_length": len(
-             [word for word in x["note_lyrics"] if word not in ["<AP>", "<SP>", "-"]]
-         ),  # counts the number of actual words (or characters for, e.g., Chinese/Japanese)
-     }
- )
- combined = combined.map(
-     lambda x: {
-         "lyric_word_length": len(
-             [word for word in x["note_lyrics"] if word not in ["<AP>", "<SP>", "-"]]
-         )
-     }  # counts the number of actual words (or characters for, e.g., Chinese/Japanese)
- )
-
- # 4. sort by segment_id
- combined = combined.sort("segment_id")
-
- # 5. iterate over rows
- prev_songid = None
- prev_song_segment_id = None
- song2note_lengths = {}
- song2word_lengths = {}
- for row in combined:
-     # segment_id: kising_barber_{songid}_{song_segment_id}
-     _, _, songid, song_segment_id = row["segment_id"].split("_")
-     if prev_songid != songid:
-         if prev_songid is not None:
-             assert (
-                 song_segment_id == "001"
-             ), f"prev_songid: {prev_songid}, songid: {songid}, song_segment_id: {song_segment_id}"
-         song2note_lengths[f"kising_{songid}"] = [row["note_midi_length"]]
-         song2word_lengths[f"kising_{songid}"] = [row["lyric_word_length"]]
-     else:
-         assert (
-             int(song_segment_id) >= int(prev_song_segment_id) + 1
-         ), f"prev_song_segment_id: {prev_song_segment_id}, song_segment_id: {song_segment_id}"
-         song2note_lengths[f"kising_{songid}"].append(row["note_midi_length"])
-         song2word_lengths[f"kising_{songid}"].append(row["lyric_word_length"])
-     prev_songid = songid
-     prev_song_segment_id = song_segment_id
-
- # 6. write to json
- import json
-
- with open("data/song2note_lengths.json", "w") as f:
-     json.dump(song2note_lengths, f, indent=4)
-
- with open("data/song2word_lengths.json", "w") as f:
-     json.dump(song2word_lengths, f, indent=4)
-
- # 7. push score segments to hub
- # remove audio and singer columns
- combined = combined.remove_columns(["audio", "singer"])
- # replace kising_barber_ with kising_
- combined = combined.map(
-     lambda x: {"segment_id": x["segment_id"].replace("kising_barber_", "kising_")}
- )
- # upload to hub
- combined.push_to_hub("jhansss/kising_score_segments")
path.sh DELETED
@@ -1,3 +0,0 @@
- #!/bin/bash
-
- . ~/workspace/SingingSDS/activate_python.sh
pipeline.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ import torch
+ import time
+ import librosa
+
+ from modules.asr import get_asr_model
+ from modules.llm import get_llm_model
+ from modules.svs import get_svs_model
+ from evaluation.svs_eval import load_evaluators, run_evaluation
+ from modules.melody import MelodyController
+ from modules.utils.text_normalize import clean_llm_output
+
+
+ class SingingDialoguePipeline:
+     def __init__(self, config: dict):
+         if "device" in config:
+             self.device = config["device"]
+         else:
+             self.device = "cuda" if torch.cuda.is_available() else "cpu"
+         self.cache_dir = config["cache_dir"]
+         self.asr = get_asr_model(
+             config["asr_model"], device=self.device, cache_dir=self.cache_dir
+         )
+         self.llm = get_llm_model(
+             config["llm_model"], device=self.device, cache_dir=self.cache_dir
+         )
+         self.svs = get_svs_model(
+             config["svs_model"], device=self.device, cache_dir=self.cache_dir
+         )
+         self.melody_controller = MelodyController(
+             config["melody_source"], self.cache_dir
+         )
+         self.track_latency = config.get("track_latency", False)
+         self.evaluators = load_evaluators(config.get("evaluators", {}).get("svs", []))
+
+     def set_asr_model(self, asr_model: str):
+         self.asr = get_asr_model(
+             asr_model, device=self.device, cache_dir=self.cache_dir
+         )
+
+     def set_llm_model(self, llm_model: str):
+         self.llm = get_llm_model(
+             llm_model, device=self.device, cache_dir=self.cache_dir
+         )
+
+     def set_svs_model(self, svs_model: str):
+         self.svs = get_svs_model(
+             svs_model, device=self.device, cache_dir=self.cache_dir
+         )
+
+     def set_melody_controller(self, melody_source: str):
+         self.melody_controller = MelodyController(melody_source, self.cache_dir)
+
+     def run(
+         self,
+         audio_path,
+         language,
+         prompt_template,
+         svs_inference_kwargs,
+         max_new_tokens=100,
+     ):
+         if self.track_latency:
+             asr_start_time = time.time()
+         audio_array, audio_sample_rate = librosa.load(audio_path, sr=16000)
+         asr_result = self.asr.transcribe(
+             audio_array, audio_sample_rate=audio_sample_rate, language=language
+         )
+         if self.track_latency:
+             asr_end_time = time.time()
+             asr_latency = asr_end_time - asr_start_time
+         melody_prompt = self.melody_controller.get_melody_constraints()
+         prompt = prompt_template.format(melody_prompt, asr_result)
+         if self.track_latency:
+             llm_start_time = time.time()
+         output = self.llm.generate(prompt, max_new_tokens=max_new_tokens)
+         if self.track_latency:
+             llm_end_time = time.time()
+             llm_latency = llm_end_time - llm_start_time
+         print(f"llm output: {output}")  # TODO: confirm the output does not include the prompt
+         llm_response = clean_llm_output(output, language=language)
+         score = self.melody_controller.generate_score(llm_response, language)
+         if self.track_latency:
+             svs_start_time = time.time()
+         singing_audio, sample_rate = self.svs.synthesize(
+             score, language=language, **svs_inference_kwargs
+         )
+         if self.track_latency:
+             svs_end_time = time.time()
+             svs_latency = svs_end_time - svs_start_time
+         results = {
+             "asr_text": asr_result,
+             "llm_text": llm_response,
+             "svs_audio": (singing_audio, sample_rate),
+         }
+         if self.track_latency:
+             results["metrics"] = {
+                 "asr_latency": asr_latency,
+                 "llm_latency": llm_latency,
+                 "svs_latency": svs_latency,
+             }
+         return results
+
+     def evaluate(self, audio, sample_rate):
+         return run_evaluation(audio, sample_rate, self.evaluators)
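A minimal usage sketch of the new pipeline. All config values and the prompt template below are illustrative placeholders, not values defined in this commit; the shipped options live in config/options.yaml and config/default.yaml:

```python
from pipeline import SingingDialoguePipeline

# Placeholder config; real model identifiers and melody sources
# come from config/options.yaml.
config = {
    "cache_dir": "./cache",
    "asr_model": "<asr-model-id>",
    "llm_model": "<llm-model-id>",
    "svs_model": "<svs-model-id>",
    "melody_source": "<melody-source>",
    "track_latency": True,          # adds asr/llm/svs latencies to results["metrics"]
    "evaluators": {"svs": []},      # optional SVS evaluators
}

pipeline = SingingDialoguePipeline(config)
results = pipeline.run(
    audio_path="user_turn.wav",
    language="zh",                   # placeholder language code
    prompt_template="{}\nUser: {}",  # filled as .format(melody_prompt, asr_result)
    svs_inference_kwargs={},
)

audio, sr = results["svs_audio"]
print(results["asr_text"], results["llm_text"])
```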
{resource → resources}/__init__.py RENAMED
File without changes
resources/all_plans.json ADDED
The diff for this file is too large to render. See raw diff
 
{resource → resources}/midi-note.scp RENAMED
File without changes
resources/pinyin_dict.py ADDED
@@ -0,0 +1,423 @@
+ # Adapted from Opencpop's pinyin to phoneme mapping table:
+ # https://wenet.org.cn/opencpop/resources/annotationformat/
+ PINYIN_DICT = {
+     "a": ("a",),
+     "ai": ("ai",),
+     "an": ("an",),
+     "ang": ("ang",),
+     "ao": ("ao",),
+     "ba": ("b", "a"),
+     "bai": ("b", "ai"),
+     "ban": ("b", "an"),
+     "bang": ("b", "ang"),
+     "bao": ("b", "ao"),
+     "bei": ("b", "ei"),
+     "ben": ("b", "en"),
+     "beng": ("b", "eng"),
+     "bi": ("b", "i"),
+     "bian": ("b", "ian"),
+     "biao": ("b", "iao"),
+     "bie": ("b", "ie"),
+     "bin": ("b", "in"),
+     "bing": ("b", "ing"),
+     "bo": ("b", "o"),
+     "bu": ("b", "u"),
+     "ca": ("c", "a"),
+     "cai": ("c", "ai"),
+     "can": ("c", "an"),
+     "cang": ("c", "ang"),
+     "cao": ("c", "ao"),
+     "ce": ("c", "e"),
+     "cei": ("c", "ei"),
+     "cen": ("c", "en"),
+     "ceng": ("c", "eng"),
+     "cha": ("ch", "a"),
+     "chai": ("ch", "ai"),
+     "chan": ("ch", "an"),
+     "chang": ("ch", "ang"),
+     "chao": ("ch", "ao"),
+     "che": ("ch", "e"),
+     "chen": ("ch", "en"),
+     "cheng": ("ch", "eng"),
+     "chi": ("ch", "i"),
+     "chong": ("ch", "ong"),
+     "chou": ("ch", "ou"),
+     "chu": ("ch", "u"),
+     "chua": ("ch", "ua"),
+     "chuai": ("ch", "uai"),
+     "chuan": ("ch", "uan"),
+     "chuang": ("ch", "uang"),
+     "chui": ("ch", "ui"),
+     "chun": ("ch", "un"),
+     "chuo": ("ch", "uo"),
+     "ci": ("c", "i"),
+     "cong": ("c", "ong"),
+     "cou": ("c", "ou"),
+     "cu": ("c", "u"),
+     "cuan": ("c", "uan"),
+     "cui": ("c", "ui"),
+     "cun": ("c", "un"),
+     "cuo": ("c", "uo"),
+     "da": ("d", "a"),
+     "dai": ("d", "ai"),
+     "dan": ("d", "an"),
+     "dang": ("d", "ang"),
+     "dao": ("d", "ao"),
+     "de": ("d", "e"),
+     "dei": ("d", "ei"),
+     "den": ("d", "en"),
+     "deng": ("d", "eng"),
+     "di": ("d", "i"),
+     "dia": ("d", "ia"),
+     "dian": ("d", "ian"),
+     "diao": ("d", "iao"),
+     "die": ("d", "ie"),
+     "ding": ("d", "ing"),
+     "diu": ("d", "iu"),
+     "dong": ("d", "ong"),
+     "dou": ("d", "ou"),
+     "du": ("d", "u"),
+     "duan": ("d", "uan"),
+     "dui": ("d", "ui"),
+     "dun": ("d", "un"),
+     "duo": ("d", "uo"),
+     "e": ("e",),
+     "ei": ("ei",),
+     "en": ("en",),
+     "eng": ("eng",),
+     "er": ("er",),
+     "fa": ("f", "a"),
+     "fan": ("f", "an"),
+     "fang": ("f", "ang"),
+     "fei": ("f", "ei"),
+     "fen": ("f", "en"),
+     "feng": ("f", "eng"),
+     "fo": ("f", "o"),
+     "fou": ("f", "ou"),
+     "fu": ("f", "u"),
+     "ga": ("g", "a"),
+     "gai": ("g", "ai"),
+     "gan": ("g", "an"),
+     "gang": ("g", "ang"),
+     "gao": ("g", "ao"),
+     "ge": ("g", "e"),
+     "gei": ("g", "ei"),
+     "gen": ("g", "en"),
+     "geng": ("g", "eng"),
+     "gong": ("g", "ong"),
+     "gou": ("g", "ou"),
+     "gu": ("g", "u"),
+     "gua": ("g", "ua"),
+     "guai": ("g", "uai"),
+     "guan": ("g", "uan"),
+     "guang": ("g", "uang"),
+     "gui": ("g", "ui"),
+     "gun": ("g", "un"),
+     "guo": ("g", "uo"),
+     "ha": ("h", "a"),
+     "hai": ("h", "ai"),
+     "han": ("h", "an"),
+     "hang": ("h", "ang"),
+     "hao": ("h", "ao"),
+     "he": ("h", "e"),
+     "hei": ("h", "ei"),
+     "hen": ("h", "en"),
+     "heng": ("h", "eng"),
+     "hm": ("h", "m"),
+     "hng": ("h", "ng"),
+     "hong": ("h", "ong"),
+     "hou": ("h", "ou"),
+     "hu": ("h", "u"),
+     "hua": ("h", "ua"),
+     "huai": ("h", "uai"),
+     "huan": ("h", "uan"),
+     "huang": ("h", "uang"),
+     "hui": ("h", "ui"),
+     "hun": ("h", "un"),
+     "huo": ("h", "uo"),
+     "ji": ("j", "i"),
+     "jia": ("j", "ia"),
+     "jian": ("j", "ian"),
+     "jiang": ("j", "iang"),
+     "jiao": ("j", "iao"),
+     "jie": ("j", "ie"),
+     "jin": ("j", "in"),
+     "jing": ("j", "ing"),
+     "jiong": ("j", "iong"),
+     "jiu": ("j", "iu"),
+     "ju": ("j", "v"),
+     "juan": ("j", "van"),
+     "jue": ("j", "ve"),
+     "jun": ("j", "vn"),
+     "ka": ("k", "a"),
+     "kai": ("k", "ai"),
+     "kan": ("k", "an"),
+     "kang": ("k", "ang"),
+     "kao": ("k", "ao"),
+     "ke": ("k", "e"),
+     "kei": ("k", "ei"),
+     "ken": ("k", "en"),
+     "keng": ("k", "eng"),
+     "kong": ("k", "ong"),
+     "kou": ("k", "ou"),
+     "ku": ("k", "u"),
+     "kua": ("k", "ua"),
+     "kuai": ("k", "uai"),
+     "kuan": ("k", "uan"),
+     "kuang": ("k", "uang"),
+     "kui": ("k", "ui"),
+     "kun": ("k", "un"),
+     "kuo": ("k", "uo"),
+     "la": ("l", "a"),
+     "lai": ("l", "ai"),
+     "lan": ("l", "an"),
+     "lang": ("l", "ang"),
+     "lao": ("l", "ao"),
+     "le": ("l", "e"),
+     "lei": ("l", "ei"),
+     "leng": ("l", "eng"),
+     "li": ("l", "i"),
+     "lia": ("l", "ia"),
+     "lian": ("l", "ian"),
+     "liang": ("l", "iang"),
+     "liao": ("l", "iao"),
+     "lie": ("l", "ie"),
+     "lin": ("l", "in"),
+     "ling": ("l", "ing"),
+     "liu": ("l", "iu"),
+     "lo": ("l", "o"),
+     "long": ("l", "ong"),
+     "lou": ("l", "ou"),
+     "lu": ("l", "u"),
+     "luan": ("l", "uan"),
+     "lun": ("l", "un"),
+     "luo": ("l", "uo"),
+     "lv": ("l", "v"),
+     "lve": ("l", "ve"),
+     "m": ("m",),
+     "ma": ("m", "a"),
+     "mai": ("m", "ai"),
+     "man": ("m", "an"),
+     "mang": ("m", "ang"),
+     "mao": ("m", "ao"),
+     "me": ("m", "e"),
+     "mei": ("m", "ei"),
+     "men": ("m", "en"),
+     "meng": ("m", "eng"),
+     "mi": ("m", "i"),
+     "mian": ("m", "ian"),
+     "miao": ("m", "iao"),
+     "mie": ("m", "ie"),
+     "min": ("m", "in"),
+     "ming": ("m", "ing"),
+     "miu": ("m", "iu"),
+     "mo": ("m", "o"),
+     "mou": ("m", "ou"),
+     "mu": ("m", "u"),
+     "n": ("n",),
+     "na": ("n", "a"),
+     "nai": ("n", "ai"),
+     "nan": ("n", "an"),
+     "nang": ("n", "ang"),
+     "nao": ("n", "ao"),
+     "ne": ("n", "e"),
+     "nei": ("n", "ei"),
+     "nen": ("n", "en"),
+     "neng": ("n", "eng"),
+     "ng": ("n", "g"),
+     "ni": ("n", "i"),
+     "nian": ("n", "ian"),
+     "niang": ("n", "iang"),
+     "niao": ("n", "iao"),
+     "nie": ("n", "ie"),
+     "nin": ("n", "in"),
+     "ning": ("n", "ing"),
+     "niu": ("n", "iu"),
+     "nong": ("n", "ong"),
+     "nou": ("n", "ou"),
+     "nu": ("n", "u"),
+     "nuan": ("n", "uan"),
+     "nun": ("n", "un"),
+     "nuo": ("n", "uo"),
+     "nv": ("n", "v"),
+     "nve": ("n", "ve"),
+     "o": ("o",),
+     "ou": ("ou",),
+     "pa": ("p", "a"),
+     "pai": ("p", "ai"),
+     "pan": ("p", "an"),
+     "pang": ("p", "ang"),
+     "pao": ("p", "ao"),
+     "pei": ("p", "ei"),
+     "pen": ("p", "en"),
+     "peng": ("p", "eng"),
+     "pi": ("p", "i"),
+     "pian": ("p", "ian"),
+     "piao": ("p", "iao"),
+     "pie": ("p", "ie"),
+     "pin": ("p", "in"),
+     "ping": ("p", "ing"),
+     "po": ("p", "o"),
+     "pou": ("p", "ou"),
+     "pu": ("p", "u"),
+     "qi": ("q", "i"),
+     "qia": ("q", "ia"),
+     "qian": ("q", "ian"),
+     "qiang": ("q", "iang"),
+     "qiao": ("q", "iao"),
+     "qie": ("q", "ie"),
+     "qin": ("q", "in"),
+     "qing": ("q", "ing"),
+     "qiong": ("q", "iong"),
+     "qiu": ("q", "iu"),
+     "qu": ("q", "v"),
+     "quan": ("q", "van"),
+     "que": ("q", "ve"),
+     "qun": ("q", "vn"),
+     "ran": ("r", "an"),
+     "rang": ("r", "ang"),
+     "rao": ("r", "ao"),
+     "re": ("r", "e"),
+     "ren": ("r", "en"),
+     "reng": ("r", "eng"),
+     "ri": ("r", "i"),
+     "rong": ("r", "ong"),
+     "rou": ("r", "ou"),
+     "ru": ("r", "u"),
+     "rua": ("r", "ua"),
+     "ruan": ("r", "uan"),
+     "rui": ("r", "ui"),
+     "run": ("r", "un"),
+     "ruo": ("r", "uo"),
+     "sa": ("s", "a"),
+     "sai": ("s", "ai"),
+     "san": ("s", "an"),
+     "sang": ("s", "ang"),
+     "sao": ("s", "ao"),
+     "se": ("s", "e"),
+     "sen": ("s", "en"),
+     "seng": ("s", "eng"),
+     "sha": ("sh", "a"),
+     "shai": ("sh", "ai"),
+     "shan": ("sh", "an"),
+     "shang": ("sh", "ang"),
+     "shao": ("sh", "ao"),
+     "she": ("sh", "e"),
+     "shei": ("sh", "ei"),
+     "shen": ("sh", "en"),
+     "sheng": ("sh", "eng"),
+     "shi": ("sh", "i"),
+     "shou": ("sh", "ou"),
+     "shu": ("sh", "u"),
+     "shua": ("sh", "ua"),
+     "shuai": ("sh", "uai"),
+     "shuan": ("sh", "uan"),
+     "shuang": ("sh", "uang"),
+     "shui": ("sh", "ui"),
+     "shun": ("sh", "un"),
+     "shuo": ("sh", "uo"),
+     "si": ("s", "i"),
+     "song": ("s", "ong"),
+     "sou": ("s", "ou"),
+     "su": ("s", "u"),
+     "suan": ("s", "uan"),
+     "sui": ("s", "ui"),
+     "sun": ("s", "un"),
+     "suo": ("s", "uo"),
+     "ta": ("t", "a"),
+     "tai": ("t", "ai"),
+     "tan": ("t", "an"),
+     "tang": ("t", "ang"),
+     "tao": ("t", "ao"),
+     "te": ("t", "e"),
+     "tei": ("t", "ei"),
+     "teng": ("t", "eng"),
+     "ti": ("t", "i"),
+     "tian": ("t", "ian"),
+     "tiao": ("t", "iao"),
+     "tie": ("t", "ie"),
+     "ting": ("t", "ing"),
+     "tong": ("t", "ong"),
+     "tou": ("t", "ou"),
+     "tu": ("t", "u"),
+     "tuan": ("t", "uan"),
+     "tui": ("t", "ui"),
+     "tun": ("t", "un"),
+     "tuo": ("t", "uo"),
+     "wa": ("w", "a"),
+     "wai": ("w", "ai"),
+     "wan": ("w", "an"),
+     "wang": ("w", "ang"),
+     "wei": ("w", "ei"),
+     "wen": ("w", "en"),
+     "weng": ("w", "eng"),
+     "wo": ("w", "o"),
+     "wu": ("w", "u"),
+     "xi": ("x", "i"),
+     "xia": ("x", "ia"),
+     "xian": ("x", "ian"),
+     "xiang": ("x", "iang"),
+     "xiao": ("x", "iao"),
+     "xie": ("x", "ie"),
+     "xin": ("x", "in"),
+     "xing": ("x", "ing"),
+     "xiong": ("x", "iong"),
+     "xiu": ("x", "iu"),
+     "xu": ("x", "v"),
+     "xuan": ("x", "van"),
+     "xue": ("x", "ve"),
+     "xun": ("x", "vn"),
+     "ya": ("y", "a"),
+     "yan": ("y", "an"),
+     "yang": ("y", "ang"),
+     "yao": ("y", "ao"),
+     "ye": ("y", "e"),
+     "yi": ("y", "i"),
+     "yin": ("y", "in"),
+     "ying": ("y", "ing"),
+     "yo": ("y", "o"),
+     "yong": ("y", "ong"),
+     "you": ("y", "ou"),
+     "yu": ("y", "v"),
+     "yuan": ("y", "van"),
+     "yue": ("y", "ve"),
+     "yun": ("y", "vn"),
+     "za": ("z", "a"),
+     "zai": ("z", "ai"),
+     "zan": ("z", "an"),
+     "zang": ("z", "ang"),
+     "zao": ("z", "ao"),
+     "ze": ("z", "e"),
+     "zei": ("z", "ei"),
+     "zen": ("z", "en"),
+     "zeng": ("z", "eng"),
+     "zha": ("zh", "a"),
+     "zhai": ("zh", "ai"),
+     "zhan": ("zh", "an"),
+     "zhang": ("zh", "ang"),
+     "zhao": ("zh", "ao"),
+     "zhe": ("zh", "e"),
+     "zhei": ("zh", "ei"),
+     "zhen": ("zh", "en"),
+     "zheng": ("zh", "eng"),
+     "zhi": ("zh", "i"),
+     "zhong": ("zh", "ong"),
+     "zhou": ("zh", "ou"),
+     "zhu": ("zh", "u"),
+     "zhua": ("zh", "ua"),
+     "zhuai": ("zh", "uai"),
+     "zhuan": ("zh", "uan"),
+     "zhuang": ("zh", "uang"),
+     "zhui": ("zh", "ui"),
+     "zhun": ("zh", "un"),
+     "zhuo": ("zh", "uo"),
+     "zi": ("z", "i"),
+     "zong": ("z", "ong"),
+     "zou": ("z", "ou"),
+     "zu": ("z", "u"),
+     "zuan": ("z", "uan"),
+     "zui": ("z", "ui"),
+     "zun": ("z", "un"),
+     "zuo": ("z", "uo"),
+ }
+
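A quick sanity check of the table's conventions, using entries taken directly from the mapping above:

```python
from resources.pinyin_dict import PINYIN_DICT

# Syllables split into (initial, final); zero-initial syllables are 1-tuples,
# and "ü" finals are written with "v", following the Opencpop convention.
assert PINYIN_DICT["zhuang"] == ("zh", "uang")
assert PINYIN_DICT["an"] == ("an",)
assert PINYIN_DICT["ju"] == ("j", "v")
```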
{resource → resources}/singer/singer_embedding_ace-1.npy RENAMED
File without changes
{resource → resources}/singer/singer_embedding_ace-10.npy RENAMED
File without changes
{resource → resources}/singer/singer_embedding_ace-11.npy RENAMED
File without changes
{resource → resources}/singer/singer_embedding_ace-12.npy RENAMED
File without changes
{resource → resources}/singer/singer_embedding_ace-13.npy RENAMED
File without changes
{resource → resources}/singer/singer_embedding_ace-14.npy RENAMED
File without changes
{resource → resources}/singer/singer_embedding_ace-15.npy RENAMED
File without changes
{resource → resources}/singer/singer_embedding_ace-16.npy RENAMED
File without changes
{resource → resources}/singer/singer_embedding_ace-17.npy RENAMED
File without changes
{resource → resources}/singer/singer_embedding_ace-18.npy RENAMED
File without changes
{resource → resources}/singer/singer_embedding_ace-19.npy RENAMED
File without changes