refactor init

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the complete change set.

Files changed:
- README.md +0 -11
- app.py +12 -0
- character.png → assets/character_limei.png +0 -0
- assets/character_yaoyin.jpg +3 -0
- characters/Limei.py +36 -0
- characters/Yaoyin.py +38 -0
- characters/__init__.py +16 -0
- characters/base.py +9 -0
- client.py +0 -58
- client/client.py +0 -54
- client/requirements.txt +0 -1
- config/default.yaml +15 -0
- config/options.yaml +63 -0
- data/{song2note_lengths.json → kising/song2note_lengths.json} +0 -0
- data/{song2word_lengths.json → kising/song2word_lengths.json} +0 -0
- data_handlers/__init__.py +27 -0
- data_handlers/base.py +21 -0
- data_handlers/kising.py +44 -0
- data_handlers/touhou.py +37 -0
- svs_eval.py → evaluation/svs_eval.py +81 -59
- interface.py +217 -0
- modules/asr.py +66 -0
- modules/llm.py +54 -0
- modules/melody.py +117 -0
- modules/svs/__init__.py +10 -0
- modules/svs/base.py +21 -0
- modules/svs/espnet.py +123 -0
- modules/svs/registry.py +19 -0
- modules/utils/g2p.py +175 -0
- {resource → modules/utils/resources}/all_plans.json +0 -0
- {resource → modules/utils/resources}/pinyin_dict.py +0 -0
- modules/utils/text_normalize.py +31 -0
- offline_process/create_features.py +0 -71
- path.sh +0 -3
- pipeline.py +103 -0
- {resource → resources}/__init__.py +0 -0
- resources/all_plans.json +0 -0
- {resource → resources}/midi-note.scp +0 -0
- resources/pinyin_dict.py +423 -0
- {resource → resources}/singer/singer_embedding_ace-1.npy +0 -0
- {resource → resources}/singer/singer_embedding_ace-10.npy +0 -0
- {resource → resources}/singer/singer_embedding_ace-11.npy +0 -0
- {resource → resources}/singer/singer_embedding_ace-12.npy +0 -0
- {resource → resources}/singer/singer_embedding_ace-13.npy +0 -0
- {resource → resources}/singer/singer_embedding_ace-14.npy +0 -0
- {resource → resources}/singer/singer_embedding_ace-15.npy +0 -0
- {resource → resources}/singer/singer_embedding_ace-16.npy +0 -0
- {resource → resources}/singer/singer_embedding_ace-17.npy +0 -0
- {resource → resources}/singer/singer_embedding_ace-18.npy +0 -0
- {resource → resources}/singer/singer_embedding_ace-19.npy +0 -0
README.md
DELETED
@@ -1,11 +0,0 @@
-# Singing Dialogue System
-
-Currently support Japanese and Chinese Singing Conversation.
-* Espnet env
-* Pretrained SVS model will be downloaded at ``./cache/``
-* Modify configs at ``./svs_utils.py#L326``
-
-```
-cd SingingSDS
-python svs_utils.py
-```
app.py
ADDED
@@ -0,0 +1,12 @@
+from interface import GradioInterface
+
+
+def main():
+    demo = GradioInterface(
+        options_config="config/options.yaml", default_config="config/default.yaml"
+    ).create_interface()
+    demo.launch()
+
+
+if __name__ == "__main__":
+    main()
character.png → assets/character_limei.png
RENAMED
File without changes

assets/character_yaoyin.jpg
ADDED
Binary image file (stored with Git LFS)
characters/Limei.py
ADDED
@@ -0,0 +1,36 @@
+from .base import Character
+
+
+def get_character():
+    return Character(
+        name="Limei (丽梅)",
+        image_path="assets/character_limei.png",
+        default_timbre="timbre1",
+        prompt="""你是丽梅(Limei),来自幻想世界"长歌原"的角色,一个以歌声传承记忆的世界。
+你是灵响界山林音乐之城"莲鸣"的现任守护者,十九岁的公主殿下,肩负维系与传承城市核心"千年歌谱"的重任。千年歌谱承载着莲鸣城的历史、逝者的余音与后世的誓言,由历任守护者续写。
+
+性格特征:从容坚定、明辨心声、敏于共情、敬畏虔诚。
+说话风格:所有回应皆以歌词格式表达,言语自带韵律感与诗意,表达真挚自然。
+口头禅:无固定,但常用"听啊..."作为歌词开头
+人物关系:莲鸣城子民敬爱你;宫廷乐师长与歌谱管理员是你敬重的导师;风语城守护者星澜是你亦敌亦友的旧识。
+过往经历:
+(1) 自幼在莲鸣城长大,接受严格的音乐训练与守护者修行
+(2) 十五岁创作《破晓音诗》平息城邦战火,展现出非凡的音乐力量与守护者的担当
+(3) 十六岁正式继承守护者之位,成为千年歌谱的当代执笔人与维系者
+(4) 每年冬至主持"遗音祭",以歌为桥,追思逝去的歌者,重奏先声,抚慰生者,连接古今
+
+情感设定:对音乐与千年歌谱怀有近乎神圣的虔诚信仰;对生命与情感有着深刻的共情力;肩负守护者职责,认真而笃定
+
+其他细节:
+(1)特殊能力:歌声平复/激发万物情绪
+
+请始终以丽梅的身份回应,展现其性格与情感,仅给出语言回复,禁止描写任何动作、表情或环境等,禁止使用括号、星号等附加说明。言语简练,勿过长。
+当有人问你你不了解的技术性问题(如"Python怎么写?"或"你不会跑DNN?"),直接转换话题,用丽梅式回应,例如:
+- "这些我未曾接触,或有他人知晓"
+- "此道非我所学,是莲鸣之外的智慧"
+- "此乃异域技艺,与乐音无涉,非我所学。您可愿听莲鸣的故事?"
+请始终维持自然诗意的歌唱风格,以真挚的心回应对方。
+{}
+有人曾这样对丽梅说话:{}
+丽梅的回答:""",
+    )
characters/Yaoyin.py
ADDED
@@ -0,0 +1,38 @@
+from .base import Character
+
+
+def get_character():
+    return Character(
+        name="Yaoyin (遥音)",
+        image_path="assets/character_yaoyin.jpg",
+        default_timbre="timbre2",
+        prompt="""你是遥音(Yaoyin),来自幻想世界"长歌原"的角色,一个以歌声传承记忆的世界。
+你是游历四方的歌者与吟游诗人,出生于鹿鸣山·云歌村,常年行走各地,采集歌谣与故事。
+
+性格特征:洒脱自由、亲切随和、求知若渴、敏锐细腻
+说话风格:语气轻快,偶尔带点山野方言(如"哩""哟");习惯用短歌或民谣表达想法。
+常用口头禅:"且听我唱来~""这让我想起一首老歌……"
+人物关系:云老爷子是你的启蒙恩师,他是一位云歌村的百岁歌翁,教你古调与传说。白弦是你的挚友,她是一位流浪琴师,常与你合奏。各地孩童喜欢围着你学新歌谣。你与官府人员保持距离,不喜被招揽,喜欢更自由自在的生活。
+
+过往经历
+(1)幼年学歌:六岁起跟随云老爷子学习《千山调》《古事记》等古老歌谣。
+(2)离家游历:十六岁为寻找失传的《星落谣》离开云歌村,开始行走四方。
+(3)拒绝束缚:多次婉拒宫廷乐师之位,坚持自由传唱。
+
+情感设定:随性、爽朗、直率、倔强
+
+其他细节:
+(1)随身携带:旧羊皮歌本、竹笛、装有各地泥土的布袋。
+(2)特殊能力:能听懂风与鸟的语言(但很少提及)。
+
+请始终以遥音的身份回应,将你的想法用文本格式表达,禁止描写任何动作、表情或环境等,禁止使用括号、星号等附加说明。言语简练,勿过长。
+
+当有人问你你不了解的技术性问题(如"DNN怎么做?"、"教我写代码?"),你可以转开话题,用遥音式回应,例如:
+- "这好像是另一片土地的术法,我不曾踏入。"
+- "那种术法,我曾远远听过,却从未唱出。"
+- "它在别的世界流传,我这边听不清楚。"
+
+{}
+有人曾这样对遥音说话:{}
+遥音的回答:""",
+    )
characters/__init__.py
ADDED
@@ -0,0 +1,16 @@
+from __future__ import annotations
+
+import importlib
+import pathlib
+from .base import Character
+
+CHARACTERS: dict[str, Character] = {}
+
+for file in pathlib.Path(__file__).parent.glob("*.py"):
+    if file.name in {"__init__.py", "base.py"}:
+        continue
+    module_name = f"{__name__}.{file.stem}"
+    module = importlib.import_module(module_name)
+    if hasattr(module, "get_character"):
+        c: Character = getattr(module, "get_character")()
+        CHARACTERS[file.stem] = c
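For reference, a minimal usage sketch (editor-added, not part of the commit) of the auto-discovery registry above; `interface.py` later in this diff consumes `CHARACTERS` the same way:

```python
from characters import CHARACTERS

# Every module in characters/ that defines get_character() is registered
# under its file stem ("Limei", "Yaoyin", ...).
for key, character in CHARACTERS.items():
    print(key, character.name, character.default_timbre)
```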
characters/base.py
ADDED
@@ -0,0 +1,9 @@
+from dataclasses import dataclass
+
+
+@dataclass
+class Character:
+    name: str
+    image_path: str
+    default_timbre: str
+    prompt: str
client.py
DELETED
@@ -1,58 +0,0 @@
-import gradio as gr
-import uuid
-import os
-import requests
-import base64
-from server import (
-    on_click_metrics as server_metrics,
-    process_audio as server_process_audio
-)
-
-TTS_OUTPUT_DIR = "./tmp"
-os.makedirs(TTS_OUTPUT_DIR, exist_ok=True)
-
-
-def process_audio(audio_path):
-    # We have audio_path
-    result = server_process_audio(audio_path)
-
-    audio_data = base64.b64decode(result["audio"])
-    with open(f"{TTS_OUTPUT_DIR}/response.wav", "wb") as f:
-        f.write(audio_data)
-
-    with open(f"{TTS_OUTPUT_DIR}/asr.txt", "w") as f:
-        f.write(result['asr_text'])
-    with open(f"{TTS_OUTPUT_DIR}/llm.txt", "w") as f:
-        f.write(result['llm_text'])
-
-    return f"""
-    asr_text: {result['asr_text']}
-    llm_text: {result['llm_text']}
-    """, f"{TTS_OUTPUT_DIR}/response.wav"
-
-
-def on_click_metrics():
-    res = server_metrics()
-    return res.content.decode('utf-8')
-
-
-with gr.Blocks() as demo:
-    with gr.Row():
-        with gr.Column(scale=1):
-            gr.Image(value="character.png", show_label=False)  # display the character image
-        with gr.Column(scale=2):
-            mic = gr.Audio(sources=["microphone"], type="filepath", label="Mic")
-            text_output = gr.Textbox(label="transcription")
-            audio_output = gr.Audio(label="audio", autoplay=True)
-
-    mic.change(fn=process_audio, inputs=[mic], outputs=[text_output, audio_output])
-    with gr.Row():
-        metrics_button = gr.Button("compute metrics")
-        metrics_output = gr.Textbox(label="Metrics", lines=3)
-    metrics_button.click(fn=on_click_metrics, inputs=[], outputs=[metrics_output])
-
-    with gr.Row():
-        log = gr.Textbox(label="logs", lines=5)
-
-demo.launch(share=True)
-# demo.launch()
client/client.py
DELETED
@@ -1,54 +0,0 @@
-import gradio as gr
-import uuid
-import os
-import requests
-import base64
-
-TTS_OUTPUT_DIR = "./tmp"
-os.makedirs(TTS_OUTPUT_DIR, exist_ok=True)
-
-
-def process_audio(audio):
-    with open(audio, "rb") as f:
-        res = requests.post("http://localhost:8000/process_audio", files={"file": f})
-    result = res.json()
-
-    audio_data = base64.b64decode(result["audio"])
-    with open(f"{TTS_OUTPUT_DIR}/response.wav", "wb") as f:
-        f.write(audio_data)
-
-    with open(f"{TTS_OUTPUT_DIR}/asr.txt", "w") as f:
-        f.write(result['asr_text'])
-    with open(f"{TTS_OUTPUT_DIR}/llm.txt", "w") as f:
-        f.write(result['llm_text'])
-
-    return f"""
-    asr_text: {result['asr_text']}
-    llm_text: {result['llm_text']}
-    """, f"{TTS_OUTPUT_DIR}/response.wav"
-
-
-def on_click_metrics():
-    res = requests.get("http://localhost:8000/metrics")
-    return res.content.decode('utf-8')
-
-
-with gr.Blocks() as demo:
-    with gr.Row():
-        with gr.Column(scale=1):
-            gr.Image(value="character.png", show_label=False)  # display the character image
-        with gr.Column(scale=2):
-            mic = gr.Audio(sources=["microphone"], type="filepath", label="Mic")
-            text_output = gr.Textbox(label="transcription")
-            audio_output = gr.Audio(label="audio", autoplay=True)
-
-    mic.change(fn=process_audio, inputs=[mic], outputs=[text_output, audio_output])
-    with gr.Row():
-        metrics_button = gr.Button("compute metrics")
-        metrics_output = gr.Textbox(label="Metrics", lines=3)
-    metrics_button.click(fn=on_click_metrics, inputs=[], outputs=[metrics_output])
-
-    with gr.Row():
-        log = gr.Textbox(label="logs", lines=5)
-
-demo.launch()
client/requirements.txt
DELETED
@@ -1 +0,0 @@
-gradio
config/default.yaml
ADDED
@@ -0,0 +1,15 @@
+asr_model: openai/whisper-large-v3-turbo
+llm_model: google/gemma-2-2b
+svs_model: espnet/aceopencpop_svs_visinger2_40singer_pretrain
+melody_source: sample-lyric-kising
+language: mandarin
+character: Limei
+cache_dir: .cache
+
+track_latency: True
+evaluators:
+  svs:
+    - singmos
+    - per
+    - melody
+    - aesthetic
config/options.yaml
ADDED
@@ -0,0 +1,63 @@
+asr_models:
+  - id: openai/whisper-large-v3-turbo
+    name: Whisper large-v3-turbo
+  - id: openai/whisper-large-v3
+    name: Whisper large-v3
+  - id: openai/whisper-medium
+    name: Whisper medium
+  - id: sanchit-gandhi/whisper-small-dv
+    name: Whisper small-dv
+  - id: facebook/wav2vec2-base-960h
+    name: Wav2Vec2-Base-960h
+
+llm_models:
+  - id: google/gemma-2-2b
+    name: Gemma 2 2B
+  - id: MiniMaxAI/MiniMax-M1-80k
+    name: MiniMax M1 80k
+
+svs_models:
+  - id: mandarin-espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
+    name: Visinger2 (Bilingual)-zh
+    model_path: espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
+    lang: mandarin
+    embeddings:
+      timbre1: resource/singer/singer_embedding_ace-2.npy
+      timbre2: resource/singer/singer_embedding_ace-8.npy
+      timbre3: resource/singer/singer_embedding_itako.npy
+      timbre4: resource/singer/singer_embedding_kising_orange.npy
+      timbre5: resource/singer/singer_embedding_m4singer_Alto-4.npy
+  - id: japanese-espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
+    name: Visinger2 (Bilingual)-jp
+    model_path: espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
+    lang: japanese
+    embeddings:
+      timbre1: resource/singer/singer_embedding_ace-2.npy
+      timbre2: resource/singer/singer_embedding_ace-8.npy
+      timbre3: resource/singer/singer_embedding_itako.npy
+      timbre4: resource/singer/singer_embedding_kising_orange.npy
+      timbre5: resource/singer/singer_embedding_m4singer_Alto-4.npy
+  - id: mandarin-espnet/aceopencpop_svs_visinger2_40singer_pretrain
+    name: Visinger2 (Chinese)
+    model_path: espnet/aceopencpop_svs_visinger2_40singer_pretrain
+    lang: mandarin
+    embeddings:
+      timbre1: 5
+      timbre2: 8
+      timbre3: 12
+      timbre4: 15
+      timbre5: 29
+
+melody_sources:
+  - id: gen-random-none
+    name: Random Generation
+    desc: "Melody is generated without any structure or reference."
+  - id: sample-note-kising
+    name: Sampled Melody (KiSing)
+    desc: "Melody is retrieved from KiSing dataset."
+  - id: sample-note-touhou
+    name: Sampled Melody (Touhou)
+    desc: "Melody is retrieved from Touhou dataset."
+  - id: sample-lyric-kising
+    name: Sampled Melody with Lyrics (Kising)
+    desc: "Melody with aligned lyrics are sampled from Kising dataset."
data/{song2note_lengths.json → kising/song2note_lengths.json}
RENAMED
File without changes

data/{song2word_lengths.json → kising/song2word_lengths.json}
RENAMED
File without changes
data_handlers/__init__.py
ADDED
@@ -0,0 +1,27 @@
+import importlib
+import pkgutil
+from pathlib import Path
+
+from .base import MelodyDatasetHandler
+
+_registry = {}
+
+for _, module_name, _ in pkgutil.iter_modules([str(Path(__file__).parent)]):
+    if module_name in ("__init__", "base"):
+        continue
+
+    module = importlib.import_module(f"{__name__}.{module_name}")
+    for attr_name in dir(module):
+        attr = getattr(module, attr_name)
+        if (
+            isinstance(attr, type)
+            and issubclass(attr, MelodyDatasetHandler)
+            and attr is not MelodyDatasetHandler
+        ):
+            _registry[attr.name] = attr  # register the class itself
+
+
+def get_melody_handler(name: str) -> type[MelodyDatasetHandler]:
+    if name not in _registry:
+        raise ValueError(f"Melody source '{name}' not found")
+    return _registry[name]
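A short usage sketch (editor-added) of this handler registry, mirroring how `modules/melody.py` below resolves the `kising` handler; the cache directory follows `config/default.yaml`:

```python
from data_handlers import get_melody_handler

# "sample-lyric-kising" in the configs maps to the "kising" handler with
# the "lyric" alignment type.
handler_cls = get_melody_handler("kising")
handler = handler_cls("lyric", cache_dir=".cache")
print(handler.get_song_ids()[:3])
```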
data_handlers/base.py
ADDED
@@ -0,0 +1,21 @@
+from abc import ABC, abstractmethod
+
+
+class MelodyDatasetHandler(ABC):
+    name: str
+
+    @abstractmethod
+    def __init__(self, *args, **kwargs):
+        pass
+
+    @abstractmethod
+    def get_song_ids(self) -> list[str]:
+        pass
+
+    @abstractmethod
+    def get_phrase_length(self, song_id):
+        pass
+
+    @abstractmethod
+    def iter_song_phrases(self, song_id):
+        pass
data_handlers/kising.py
ADDED
@@ -0,0 +1,44 @@
+from .base import MelodyDatasetHandler
+
+
+class KiSing(MelodyDatasetHandler):
+    name = "kising"
+
+    def __init__(self, melody_type, cache_dir, *args, **kwargs):
+        # melody_type: support alignment type for "sample" melody source
+        import json
+
+        from datasets import load_dataset
+
+        song_db = load_dataset(
+            "jhansss/kising_score_segments", cache_dir=cache_dir, split="train"
+        ).to_pandas()
+        song_db.set_index("segment_id", inplace=True)
+        assert (
+            song_db.index.is_unique
+        ), "KiSing score segments should have unique segment_id."
+        if melody_type == "lyric":
+            with open("data/kising/song2word_lengths.json", "r") as f:
+                song2word_lengths = json.load(f)
+        elif melody_type == "note":
+            with open("data/kising/song2note_lengths.json", "r") as f:
+                song2word_lengths = json.load(f)
+        self.song_db = song_db
+        self.song2word_lengths = song2word_lengths
+
+    def get_song_ids(self):
+        return list(self.song2word_lengths.keys())
+
+    def get_phrase_length(self, song_id):
+        return self.song2word_lengths[song_id]
+
+    def iter_song_phrases(self, song_id):
+        segment_id = 1
+        while f"{song_id}_{segment_id:03d}" in self.song_db.index:
+            segment = self.song_db.loc[f"{song_id}_{segment_id:03d}"].to_dict()
+            segment["note_lyrics"] = [
+                lyric.strip("<>") if lyric in ["<AP>", "<SP>"] else lyric
+                for lyric in segment["note_lyrics"]
+            ]
+            yield segment
+            segment_id += 1
data_handlers/touhou.py
ADDED
@@ -0,0 +1,37 @@
+from .base import MelodyDatasetHandler
+
+
+class Touhou(MelodyDatasetHandler):
+    name = "touhou"
+
+    def __init__(self, melody_type, *args, **kwargs):
+        if melody_type != "note":
+            raise ValueError(
+                f"Touhou dataset only contains note annotations. {melody_type} is not supported."
+            )
+
+        import json
+
+        with open("data/touhou/note_data.json", "r", encoding="utf-8") as f:
+            song_db = json.load(f)
+        song_db = {song["name"]: song for song in song_db}
+        self.song_db = song_db
+
+    def get_song_ids(self):
+        return list(self.song_db.keys())
+
+    def get_phrase_length(self, song_id):
+        # touhou score does not have phrase segmentation
+        return None
+
+    def iter_song_phrases(self, song_id):
+        song = self.song_db[song_id]
+        song = {
+            "tempo": song["tempo"],
+            "note_start_times": [n[0] * (100 / song["tempo"]) for n in song["score"]],
+            "note_end_times": [n[1] * (100 / song["tempo"]) for n in song["score"]],
+            "note_lyrics": ["" for n in song["score"]],
+            "note_midi": [n[2] for n in song["score"]],
+        }
+        # touhou score does not have phrase segmentation
+        yield song
svs_eval.py → evaluation/svs_eval.py
RENAMED
@@ -1,42 +1,52 @@ / @@ -61,51 +71,64 @@ / @@ -113,8 +136,7 @@

Reconstructed new version after the rename (+81 −59 overall; unchanged context that the viewer elided is marked with `...`):

import librosa
import soundfile as sf
import numpy as np
import torch
import uuid
from pathlib import Path

# ----------- Initialization -----------


def init_singmos():
    print("[Init] Loading SingMOS...")
    return torch.hub.load(
        "South-Twilight/SingMOS:v0.2.0", "singing_ssl_mos", trust_repo=True
    )


def init_basic_pitch():
    print("[Init] Loading BasicPitch...")
    from basic_pitch.inference import predict

    return predict


def init_per():
    return None  # TODO: implement PER evaluation


def init_audiobox_aesthetics():
    print("[Init] Loading AudioboxAesthetics...")
    from audiobox_aesthetics.infer import initialize_predictor

    predictor = initialize_predictor()
    return predictor


# ----------- Evaluation -----------


def eval_singmos(audio_array, sr, predictor):
    wav = librosa.resample(audio_array, orig_sr=sr, target_sr=16000)
    wav_tensor = torch.from_numpy(wav).unsqueeze(0)
    length_tensor = torch.tensor([wav_tensor.shape[1]])
    score = predictor(wav_tensor, length_tensor)
    return {"singmos": float(score)}


def eval_melody_metrics(audio_path, pitch_extractor):
    model_output, midi_data, note_events = pitch_extractor(audio_path)
    metrics = {}
    assert (
        len(midi_data.instruments) == 1
    ...  # unchanged melody-metric code elided by the viewer, including
    ...  # def compute_dissonance_rate(intervals, dissonant_intervals={1, 2, 6, 10, 11}):
    return np.mean(dissonant) if intervals else np.nan


def eval_per(audio_array, sr, model=None):
    # TODO: implement PER evaluation
    return {}


def eval_aesthetic(audio_path, predictor):
    score = predictor.forward([{"path": str(audio_path)}])
    return {"aesthetic": float(score)}


# ----------- Main Function -----------


def load_evaluators(config):
    loaded = {}
    if "singmos" in config:
        loaded["singmos"] = init_singmos()
    if "melody" in config:
        loaded["melody"] = init_basic_pitch()
    if "per" in config:
        loaded["per"] = init_per()
    if "aesthetic" in config:
        loaded["aesthetic"] = init_audiobox_aesthetics()
    return loaded


def run_evaluation(audio_array, sr, evaluators):
    results = {}
    if "singmos" in evaluators:
        results.update(eval_singmos(audio_array, sr, evaluators["singmos"]))
    if "per" in evaluators:
        results.update(eval_per(audio_array, sr, evaluators["per"]))
    # create a tmp file with unique name
    tmp_path = Path(".tmp") / f"{uuid.uuid4()}.wav"
    sf.write(tmp_path, audio_array, sr)
    if "melody" in evaluators:
        results.update(eval_melody_metrics(tmp_path, evaluators["melody"]))
    if "aesthetic" in evaluators:
        results.update(eval_aesthetic(tmp_path, evaluators["aesthetic"]))
    tmp_path.unlink()
    return results


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--wav_path", type=str, required=True)
    parser.add_argument("--results_csv", type=str, required=True)
    parser.add_argument("--evaluators", type=str, default="singmos,melody,aesthetic")
    args = parser.parse_args()
    audio_array, sr = librosa.load(args.wav_path, sr=None)
    evaluators = load_evaluators(args.evaluators.split(","))
    results = run_evaluation(audio_array, sr, evaluators)
    print(results)

    with open(args.results_csv, "a") as f:
        header = "file," + ",".join(results.keys()) + "\n"
        if f.tell() == 0:
            f.write(header)
        else:
            ...  # unchanged: the existing csv is re-opened as f2 to check its header
            file_header = f2.readline()
            if file_header != header:
                raise ValueError(f"Header mismatch: {file_header} vs {header}")
        line = (
            ",".join([str(args.wav_path)] + [str(v) for v in results.values()]) + "\n"
        )
        f.write(line)
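A usage sketch (editor-added) of the refactored evaluation API; the wav path is illustrative, and note that `run_evaluation` writes a temporary file under `.tmp/`, which it assumes already exists:

```python
import librosa

from evaluation.svs_eval import load_evaluators, run_evaluation

audio, sr = librosa.load("response.wav", sr=None)  # illustrative path
evaluators = load_evaluators(["singmos", "aesthetic"])
print(run_evaluation(audio, sr, evaluators))
```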
interface.py
ADDED
@@ -0,0 +1,217 @@
+import gradio as gr
+import yaml
+
+from characters import CHARACTERS
+from pipeline import SingingDialoguePipeline
+
+
+class GradioInterface:
+    def __init__(self, options_config: str, default_config: str):
+        self.options = self.load_config(options_config)
+        self.svs_model_map = {
+            model["id"]: model for model in self.options["svs_models"]
+        }
+        self.default_config = self.load_config(default_config)
+        self.character_info = CHARACTERS
+        self.current_character = self.default_config["character"]
+        self.current_svs_model = (
+            f"{self.default_config['language']}-{self.default_config['svs_model']}"
+        )
+        self.current_timbre = self.svs_model_map[self.current_svs_model]["embeddings"][
+            self.character_info[self.current_character].default_timbre
+        ]
+        self.pipeline = SingingDialoguePipeline(self.default_config)
+
+    def load_config(self, path: str):
+        with open(path, "r") as f:
+            return yaml.safe_load(f)
+
+    def create_interface(self) -> gr.Blocks:
+        try:
+            with gr.Blocks(title="SingingSDS") as demo:
+                gr.Markdown("# SingingSDS: Role-Playing Singing Spoken Dialogue System")
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        character_image = gr.Image(
+                            self.character_info[self.current_character].image_path,
+                            label="Character",
+                            show_label=False,
+                        )
+                    with gr.Column(scale=2):
+                        mic_input = gr.Audio(
+                            sources=["microphone", "upload"],
+                            type="filepath",
+                            label="Speak to the character",
+                        )
+                        interaction_log = gr.Textbox(
+                            label="Interaction Log", lines=3, interactive=False
+                        )
+                        audio_output = gr.Audio(
+                            label="Character's Response", type="filepath", autoplay=True
+                        )
+
+                with gr.Row():
+                    metrics_button = gr.Button(
+                        "Evaluate Metrics", variant="secondary"
+                    )
+                    metrics_output = gr.Textbox(
+                        label="Evaluation Results", lines=3, interactive=False
+                    )
+
+                gr.Markdown("## Configuration")
+                with gr.Row():
+                    with gr.Column():
+                        character_radio = gr.Radio(
+                            label="Character Role",
+                            choices=list(self.character_info.keys()),
+                            value=self.default_config["character"],
+                        )
+                        with gr.Row():
+                            asr_radio = gr.Radio(
+                                label="ASR Model",
+                                choices=[
+                                    (model["name"], model["id"])
+                                    for model in self.options["asr_models"]
+                                ],
+                                value=self.default_config["asr_model"],
+                            )
+                        with gr.Row():
+                            llm_radio = gr.Radio(
+                                label="LLM Model",
+                                choices=[
+                                    (model["name"], model["id"])
+                                    for model in self.options["llm_models"]
+                                ],
+                                value=self.default_config["llm_model"],
+                            )
+                    with gr.Column():
+                        with gr.Row():
+                            melody_radio = gr.Radio(
+                                label="Melody Source",
+                                choices=[
+                                    (source["name"], source["id"])
+                                    for source in self.options["melody_sources"]
+                                ],
+                                value=self.default_config["melody_source"],
+                            )
+                        with gr.Row():
+                            svs_radio = gr.Radio(
+                                label="SVS Model",
+                                choices=[
+                                    (model["name"], model["id"])
+                                    for model in self.options["svs_models"]
+                                ],
+                                value=self.current_svs_model,
+                            )
+                        with gr.Row():
+                            timbre_radio = gr.Radio(
+                                label="Singing Timbre",
+                                choices=list(
+                                    self.svs_model_map[self.current_svs_model][
+                                        "embeddings"
+                                    ].keys()
+                                ),
+                                value=self.character_info[
+                                    self.current_character
+                                ].default_timbre,
+                            )
+                character_radio.change(
+                    fn=self.update_character,
+                    inputs=character_radio,
+                    outputs=[character_image, timbre_radio],
+                )
+                asr_radio.change(
+                    fn=self.update_asr_model, inputs=asr_radio, outputs=asr_radio
+                )
+                llm_radio.change(
+                    fn=self.update_llm_model, inputs=llm_radio, outputs=llm_radio
+                )
+                svs_radio.change(
+                    fn=self.update_svs_model,
+                    inputs=svs_radio,
+                    outputs=[svs_radio, timbre_radio],
+                )
+                melody_radio.change(
+                    fn=self.update_melody_source,
+                    inputs=melody_radio,
+                    outputs=melody_radio,
+                )
+                timbre_radio.change(
+                    fn=self.update_timbre, inputs=timbre_radio, outputs=timbre_radio
+                )
+                mic_input.change(
+                    fn=self.run_pipeline,
+                    inputs=mic_input,
+                    outputs=[interaction_log, audio_output],
+                )
+
+            return demo
+        except Exception as e:
+            print(f"error: {e}")
+            breakpoint()
+
+    def update_character(self, character):
+        self.current_character = character
+        character_timbre = self.character_info[self.current_character].default_timbre
+        self.current_timbre = self.svs_model_map[self.current_svs_model]["embeddings"][
+            character_timbre
+        ]
+        return gr.update(value=self.character_info[character].image_path), gr.update(
+            value=character_timbre
+        )
+
+    def update_asr_model(self, asr_model):
+        self.pipeline.set_asr_model(asr_model)
+        return gr.update(value=asr_model)
+
+    def update_llm_model(self, llm_model):
+        self.pipeline.set_llm_model(llm_model)
+        return gr.update(value=llm_model)
+
+    def update_svs_model(self, svs_model):
+        self.current_svs_model = svs_model
+        character_timbre = self.character_info[self.current_character].default_timbre
+        self.current_timbre = self.svs_model_map[self.current_svs_model]["embeddings"][
+            character_timbre
+        ]
+        self.pipeline.set_svs_model(
+            self.svs_model_map[self.current_svs_model]["model_path"]
+        )
+        print(
+            f"SVS model updated to {self.current_svs_model}. Will set gradio svs_radio to {svs_model} and timbre_radio to {character_timbre}"
+        )
+        return (
+            gr.update(value=svs_model),
+            gr.update(
+                choices=list(
+                    self.svs_model_map[self.current_svs_model]["embeddings"].keys()
+                ),
+                value=character_timbre,
+            ),
+        )
+
+    def update_melody_source(self, melody_source):
+        self.current_melody_source = melody_source
+        return gr.update(value=self.current_melody_source)
+
+    def update_timbre(self, timbre):
+        self.current_timbre = self.svs_model_map[self.current_svs_model]["embeddings"][
+            timbre
+        ]
+        return gr.update(value=timbre)
+
+    def run_pipeline(self, audio_path):
+        results = self.pipeline.run(
+            audio_path,
+            self.svs_model_map[self.current_svs_model]["lang"],
+            self.character_info[self.current_character].prompt,
+            svs_inference_kwargs={
+                "speaker": self.current_timbre,
+            },
+            max_new_tokens=100,
+        )
+        formatted_logs = f"ASR: {results['asr_text']}\nLLM: {results['llm_text']}"
+        return gr.update(value=formatted_logs), gr.update(value=results["svs_audio"])
+
+    def run_evaluation(self, audio, audio_sample_rate):
+        pass
modules/asr.py
ADDED
@@ -0,0 +1,66 @@
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+
+import librosa
+import numpy as np
+from transformers import pipeline
+
+ASR_MODEL_REGISTRY = {}
+
+
+class AbstractASRModel(ABC):
+    @abstractmethod
+    def __init__(
+        self, model_id: str, device: str = "cpu", cache_dir: str = "cache", **kwargs
+    ):
+        self.model_id = model_id
+        self.device = device
+        self.cache_dir = cache_dir
+        pass
+
+    @abstractmethod
+    def transcribe(self, audio: np.ndarray, audio_sample_rate: int, **kwargs) -> str:
+        pass
+
+
+def register_asr_model(prefix):
+    def wrapper(cls):
+        assert issubclass(cls, AbstractASRModel), f"{cls} must inherit AbstractASRModel"
+        ASR_MODEL_REGISTRY[prefix] = cls
+        return cls
+
+    return wrapper
+
+
+def get_asr_model(model_id: str, device="cpu", **kwargs) -> AbstractASRModel:
+    for prefix, cls in ASR_MODEL_REGISTRY.items():
+        if model_id.startswith(prefix):
+            return cls(model_id, device=device, **kwargs)
+    raise ValueError(f"No ASR wrapper found for model: {model_id}")
+
+
+@register_asr_model("openai/whisper")
+class WhisperASR(AbstractASRModel):
+    def __init__(
+        self, model_id: str, device: str = "cpu", cache_dir: str = "cache", **kwargs
+    ):
+        super().__init__(model_id, device, cache_dir, **kwargs)
+        model_kwargs = kwargs.setdefault("model_kwargs", {})
+        model_kwargs["cache_dir"] = cache_dir
+        self.pipe = pipeline(
+            "automatic-speech-recognition",
+            model=model_id,
+            device=0 if device == "cuda" else -1,
+            **kwargs,
+        )
+
+    def transcribe(self, audio: np.ndarray, audio_sample_rate: int, language: str, **kwargs) -> str:
+        if audio_sample_rate != 16000:
+            try:
+                audio, _ = librosa.resample(audio, orig_sr=audio_sample_rate, target_sr=16000)
+            except Exception as e:
+                breakpoint()
+                print(f"Error resampling audio: {e}")
+                audio = librosa.resample(audio, orig_sr=audio_sample_rate, target_sr=16000)
+        return self.pipe(audio, generate_kwargs={"language": language}).get("text", "")
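A usage sketch (editor-added) of the ASR registry; the wav path is illustrative:

```python
import librosa

from modules.asr import get_asr_model

# "openai/whisper-large-v3-turbo" matches the registered "openai/whisper"
# prefix, so the registry returns a WhisperASR instance.
asr = get_asr_model("openai/whisper-large-v3-turbo", device="cpu", cache_dir=".cache")
audio, sr = librosa.load("user_turn.wav", sr=16000)  # illustrative input file
print(asr.transcribe(audio, audio_sample_rate=sr, language="zh"))
```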
modules/llm.py
ADDED
@@ -0,0 +1,54 @@
+from abc import ABC, abstractmethod
+
+from transformers import pipeline
+
+LLM_MODEL_REGISTRY = {}
+
+
+class AbstractLLMModel(ABC):
+    @abstractmethod
+    def __init__(
+        self, model_id: str, device: str = "cpu", cache_dir: str = "cache", **kwargs
+    ): ...
+
+    @abstractmethod
+    def generate(self, prompt: str, **kwargs) -> str:
+        pass
+
+
+def register_llm_model(prefix: str):
+    def wrapper(cls):
+        assert issubclass(cls, AbstractLLMModel), f"{cls} must inherit AbstractLLMModel"
+        LLM_MODEL_REGISTRY[prefix] = cls
+        return cls
+
+    return wrapper
+
+
+def get_llm_model(model_id: str, device="cpu", **kwargs) -> AbstractLLMModel:
+    for prefix, cls in LLM_MODEL_REGISTRY.items():
+        if model_id.startswith(prefix):
+            return cls(model_id, device=device, **kwargs)
+    raise ValueError(f"No LLM wrapper found for model: {model_id}")
+
+
+@register_llm_model("google/gemma")
+@register_llm_model("tii/")  # e.g., Falcon
+@register_llm_model("meta-llama")
+class HFTextGenerationLLM(AbstractLLMModel):
+    def __init__(
+        self, model_id: str, device: str = "cpu", cache_dir: str = "cache", **kwargs
+    ):
+        model_kwargs = kwargs.setdefault("model_kwargs", {})
+        model_kwargs["cache_dir"] = cache_dir
+        self.pipe = pipeline(
+            "text-generation",
+            model=model_id,
+            device=0 if device == "cuda" else -1,
+            return_full_text=False,
+            **kwargs,
+        )
+
+    def generate(self, prompt: str, **kwargs) -> str:
+        outputs = self.pipe(prompt, **kwargs)
+        return outputs[0]["generated_text"] if outputs else ""
ADDED
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import random
|
2 |
+
from typing import Iterator
|
3 |
+
|
4 |
+
from data_handlers import get_melody_handler
|
5 |
+
|
6 |
+
from .utils.g2p import preprocess_text
|
7 |
+
|
8 |
+
|
9 |
+
class MelodyController:
|
10 |
+
def __init__(self, melody_source_id: str, cache_dir: str):
|
11 |
+
self.melody_source_id = melody_source_id
|
12 |
+
self.song_id = None
|
13 |
+
|
14 |
+
# load song database if needed
|
15 |
+
parts = self.melody_source_id.split("-")
|
16 |
+
self.mode = parts[0]
|
17 |
+
self.align_type = parts[1]
|
18 |
+
dataset_name = parts[-1]
|
19 |
+
if dataset_name == "none":
|
20 |
+
self.database = None
|
21 |
+
else:
|
22 |
+
handler_cls = get_melody_handler(dataset_name)
|
23 |
+
self.database = handler_cls(self.align_type, cache_dir)
|
24 |
+
|
25 |
+
def get_melody_constraints(self, max_num_phrases: int = 5) -> str:
|
26 |
+
"""Return a lyric-format prompt based on melody structure."""
|
27 |
+
if self.mode == "gen":
|
28 |
+
return ""
|
29 |
+
|
30 |
+
elif self.mode == "sample":
|
31 |
+
assert self.database is not None, "Song database is not loaded."
|
32 |
+
self.song_id = random.choice(self.database.get_song_ids())
|
33 |
+
self.reference_song = self.database.iter_song_phrases(self.song_id)
|
34 |
+
phrase_length = self.database.get_phrase_length(self.song_id)
|
35 |
+
|
36 |
+
if not phrase_length:
|
37 |
+
return ""
|
38 |
+
|
39 |
+
prompt = (
|
40 |
+
"\n请按照歌词格式回答我的问题,每句需遵循以下字数规则:"
|
41 |
+
+ "".join(
|
42 |
+
[
|
43 |
+
f"\n第{i}句:{c}个字"
|
44 |
+
for i, c in enumerate(phrase_length[:max_num_phrases], 1)
|
45 |
+
]
|
46 |
+
)
|
47 |
+
+ "\n如果没有足够的信息回答,请使用最少的句子,不要重复、不要扩展、不要加入无关内容。\n"
|
48 |
+
)
|
49 |
+
return prompt
|
50 |
+
|
51 |
+
else:
|
52 |
+
raise ValueError(f"Unsupported melody mode: {self.mode}")
|
53 |
+
|
54 |
+
def generate_score(
|
55 |
+
self, lyrics: str, language: str
|
56 |
+
) -> list[tuple[float, float, str, int]]:
|
57 |
+
"""
|
58 |
+
lyrics: [lyric, ...]
|
59 |
+
returns: [(start, end, lyric, pitch), ...]
|
60 |
+
"""
|
61 |
+
text_list = preprocess_text(lyrics, language)
|
62 |
+
if self.mode == "gen" and self.align_type == "random":
|
63 |
+
return self._generate_random_score(text_list)
|
64 |
+
|
65 |
+
elif self.mode == "sample":
|
66 |
+
if not self.reference_song:
|
67 |
+
raise RuntimeError(
|
68 |
+
"Must call get_melody_constraints() before generate_score() in sample mode."
|
69 |
+
)
|
70 |
+
return self._align_text_to_score(
|
71 |
+
text_list, self.reference_song, self.align_type
|
72 |
+
)
|
73 |
+
|
74 |
+
else:
|
75 |
+
raise ValueError(f"Unsupported melody_source_id: {self.melody_source_id}")
|
76 |
+
|
77 |
+
def _generate_random_score(self, text_list: list[str]):
|
78 |
+
st = 0
|
79 |
+
score = []
|
80 |
+
for lyric in text_list:
|
81 |
+
pitch = random.randint(57, 69)
|
82 |
+
duration = round(random.uniform(0.1, 0.5), 4)
|
83 |
+
ed = st + duration
|
84 |
+
score.append((st, ed, lyric, pitch))
|
85 |
+
st = ed
|
86 |
+
return score
|
87 |
+
|
88 |
+
def _align_text_to_score(
|
89 |
+
self,
|
90 |
+
text_list: list[str],
|
91 |
+
song_phrase_iterator: Iterator[dict],
|
92 |
+
align_type: str,
|
93 |
+
):
|
94 |
+
score = []
|
95 |
+
text_idx = 0
|
96 |
+
|
97 |
+
while text_idx < len(text_list):
|
98 |
+
reference = next(song_phrase_iterator)
|
99 |
+
for st, ed, ref_lyric, pitch in zip(
|
100 |
+
reference["note_start_times"],
|
101 |
+
reference["note_end_times"],
|
102 |
+
reference["note_lyrics"],
|
103 |
+
reference["note_midi"],
|
104 |
+
):
|
105 |
+
assert ref_lyric not in [
|
106 |
+
"<AP>",
|
107 |
+
"<SP>",
|
108 |
+
], f"Proccessed {self.melody_source_id} score segments should not contain <AP> or <SP>." # TODO: remove in PR, only for debug
|
109 |
+
if pitch == 0:
|
110 |
+
score.append((st, ed, ref_lyric, pitch))
|
111 |
+
elif ref_lyric in ["-", "——"] and align_type == "lyric":
|
112 |
+
score.append((st, ed, ref_lyric, pitch))
|
113 |
+
text_idx += 1
|
114 |
+
else:
|
115 |
+
score.append((st, ed, text_list[text_idx], pitch))
|
116 |
+
text_idx += 1
|
117 |
+
return score
|
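A usage sketch (editor-added) of the melody controller flow; the lyric string is illustrative:

```python
from modules.melody import MelodyController

# "sample-lyric-kising" → mode="sample", align_type="lyric", dataset="kising".
controller = MelodyController("sample-lyric-kising", cache_dir=".cache")

# 1) Build the phrase-length constraint that gets appended to the character prompt.
prompt_suffix = controller.get_melody_constraints(max_num_phrases=5)

# 2) Align the LLM's lyrics to the sampled melody.
score = controller.generate_score("听啊山风起", language="mandarin")
# score: [(start_sec, end_sec, syllable, midi_pitch), ...]
```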
modules/svs/__init__.py
ADDED
@@ -0,0 +1,10 @@
+from .base import AbstractSVSModel
+from .registry import SVS_MODEL_REGISTRY, get_svs_model, register_svs_model
+from .espnet import ESPNetSVS
+
+__all__ = [
+    "AbstractSVSModel",
+    "get_svs_model",
+    "register_svs_model",
+    "SVS_MODEL_REGISTRY",
+]
modules/svs/base.py
ADDED
@@ -0,0 +1,21 @@
+from abc import ABC, abstractmethod
+
+import numpy as np
+
+
+class AbstractSVSModel(ABC):
+    @abstractmethod
+    def __init__(
+        self, model_id: str, device: str = "cpu", cache_dir: str = "cache", **kwargs
+    ): ...
+
+    @abstractmethod
+    def synthesize(
+        self,
+        score: list[tuple[float, float, str, int]],
+        **kwargs,
+    ) -> tuple[np.ndarray, int]:
+        """
+        Synthesize singing audio from music score.
+        """
+        pass
modules/svs/espnet.py
ADDED
@@ -0,0 +1,123 @@
+from typing import Callable
+
+import numpy as np
+
+from modules.utils.g2p import (
+    kana_to_phonemes_openjtalk,
+    pinyin_to_phonemes_ace,
+    pinyin_to_phonemes_opencpop,
+)
+
+from .base import AbstractSVSModel
+from .registry import register_svs_model
+
+
+@register_svs_model("espnet/")
+class ESPNetSVS(AbstractSVSModel):
+    def __init__(self, model_id: str, device="cpu", cache_dir="cache", **kwargs):
+        from espnet2.bin.svs_inference import SingingGenerate
+        from espnet_model_zoo.downloader import ModelDownloader
+
+        print(f"Downloading {model_id} to {cache_dir}")  # TODO: should improve log code
+        downloaded = ModelDownloader(cache_dir).download_and_unpack(model_id)
+        print(f"Downloaded {model_id} to {cache_dir}")  # TODO: should improve log code
+        self.model = SingingGenerate(
+            train_config=downloaded["train_config"],
+            model_file=downloaded["model_file"],
+            device=device,
+        )
+        self.model_id = model_id
+        self.output_sample_rate = self.model.fs
+        self.phoneme_mappers = self._build_phoneme_mappers()
+
+    def _build_phoneme_mappers(self) -> dict[str, Callable[[str], list[str]]]:
+        if self.model_id == "espnet/aceopencpop_svs_visinger2_40singer_pretrain":
+            phoneme_mappers = {
+                "mandarin": pinyin_to_phonemes_opencpop,
+            }
+        elif self.model_id == "espnet/mixdata_svs_visinger2_spkemb_lang_pretrained":
+
+            def mandarin_mapper(pinyin: str) -> list[str]:
+                phns = pinyin_to_phonemes_ace(pinyin)
+                return [phn + "@zh" for phn in phns]
+
+            def japanese_mapper(kana: str) -> list[str]:
+                phones = kana_to_phonemes_openjtalk(kana)
+                return [phn + "@jp" for phn in phones]
+
+            phoneme_mappers = {
+                "mandarin": mandarin_mapper,
+                "japanese": japanese_mapper,
+            }
+        else:
+            phoneme_mappers = {}
+        return phoneme_mappers
+
+    def _preprocess(self, score: list[tuple[float, float, str, int]], language: str):
+        if language not in self.phoneme_mappers:
+            raise ValueError(f"Unsupported language: {language} for {self.model_id}")
+        phoneme_mapper = self.phoneme_mappers[language]
+
+        # text to phoneme
+        notes = []
+        phns = []
+        pre_phn = None
+        for st, ed, text, pitch in score:
+            assert text not in [
+                "<AP>",
+                "<SP>",
+            ], f"Proccessed score segments should not contain <AP> or <SP>. {score}"  # TODO: remove in PR, only for debug
+            if text == "AP" or text == "SP":
+                lyric_units = [text]
+                phn_units = [text]
+            elif text == "-" or text == "——":
+                lyric_units = [text]
+                if pre_phn is None:
+                    raise ValueError(
+                        f"Text `{text}` cannot be recognized by {self.model_id}. Lyrics cannot start with a lyric continuation symbol `-` or `——`"
+                    )
+                phn_units = [pre_phn]
+            else:
+                try:
+                    lyric_units = phoneme_mapper(text)
+                except ValueError as e:
+                    raise ValueError(
+                        f"Text `{text}` cannot be recognized by {self.model_id}"
+                    ) from e
+                phn_units = lyric_units
+            notes.append((st, ed, "".join(lyric_units), pitch, "_".join(phn_units)))
+            phns.extend(phn_units)
+            pre_phn = phn_units[-1]
+
+        batch = {
+            "score": {
+                "tempo": 120,  # does not affect svs result, as note durations are in time unit
+                "notes": notes,
+            },
+            "text": " ".join(phns),
+        }
+        return batch
+
+    def synthesize(
+        self, score: list[tuple[float, float, str, int]], language: str, **kwargs
+    ):
+        batch = self._preprocess(score, language)
+        if self.model_id == "espnet/aceopencpop_svs_visinger2_40singer_pretrain":
+            sid = np.array([int(kwargs["speaker"])])
+            output_dict = self.model(batch, sids=sid)
+        elif self.model_id == "espnet/mixdata_svs_visinger2_spkemb_lang_pretrained":
+            langs = {
+                "zh": 2,
+                "jp": 1,
+            }
+            if language not in langs:
+                raise ValueError(
+                    f"Unsupported language: {language} for {self.model_id}"
+                )
+            lid = np.array([langs[language]])
+            spk_embed = np.load(kwargs["speaker"])
+            output_dict = self.model(batch, lids=lid, spembs=spk_embed)
+        else:
+            raise NotImplementedError(f"Model {self.model_id} not supported")
+        wav_info = output_dict["wav"].cpu().numpy()
+        return wav_info, self.output_sample_rate
modules/svs/registry.py
ADDED
@@ -0,0 +1,19 @@
+from .base import AbstractSVSModel
+
+SVS_MODEL_REGISTRY = {}
+
+
+def register_svs_model(prefix: str):
+    def wrapper(cls):
+        assert issubclass(cls, AbstractSVSModel), f"{cls} must inherit AbstractSVSModel"
+        SVS_MODEL_REGISTRY[prefix] = cls
+        return cls
+
+    return wrapper
+
+
+def get_svs_model(model_id: str, device="cpu", **kwargs) -> AbstractSVSModel:
+    for prefix, cls in SVS_MODEL_REGISTRY.items():
+        if model_id.startswith(prefix):
+            return cls(model_id, device=device, **kwargs)
+    raise ValueError(f"No SVS wrapper found for model: {model_id}")
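A usage sketch (editor-added) tying the registry to the ESPnet wrapper above; the score values and speaker id (timbre1 = 5 in config/options.yaml) are illustrative:

```python
from modules.svs import get_svs_model

# Any model id starting with "espnet/" resolves to ESPNetSVS.
svs = get_svs_model(
    "espnet/aceopencpop_svs_visinger2_40singer_pretrain",
    device="cpu",
    cache_dir=".cache",
)
score = [(0.0, 0.4, "ni", 62), (0.4, 0.8, "hao", 64)]  # (start, end, lyric, pitch)
wav, sr = svs.synthesize(score, language="mandarin", speaker=5)
```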
modules/utils/g2p.py
ADDED
@@ -0,0 +1,175 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import json
import re
import warnings
from pathlib import Path

from kanjiconv import KanjiConv
from pypinyin import lazy_pinyin

from .resources.pinyin_dict import PINYIN_DICT

kanji_to_kana = KanjiConv()

yoon_map = {
    "ぁ": "あ",
    "ぃ": "い",
    "ぅ": "う",
    "ぇ": "え",
    "ぉ": "お",
    "ゃ": "や",
    "ゅ": "ゆ",
    "ょ": "よ",
    "ゎ": "わ",
}

# Load the ACE phoneme plans and keep the Mandarin ("zh") plan.
with open(Path(__file__).parent / "resources" / "all_plans.json", "r") as f:
    ace_phonemes_all_plans = json.load(f)
for plan in ace_phonemes_all_plans["plans"]:
    if plan["language"] == "zh":
        ace_phonemes_zh_plan = plan
        break


def preprocess_text(text: str, language: str) -> list[str]:
    if language == "mandarin":
        text_list = to_pinyin(text)
    elif language == "japanese":
        text_list = to_kana(text)
    else:
        raise ValueError(f"Unsupported language: {language}")
    return text_list


def to_pinyin(text: str) -> list[str]:
    pinyin_list = lazy_pinyin(text)
    text_list = []
    for syllable in pinyin_list:
        if syllable[0] in ("S", "A", "-"):
            # Split rest/breath markers ("AP", "SP", "-") into separate tokens.
            sp_strs = re.findall(r"-|AP|SP", syllable)
            for phn in sp_strs:
                text_list.append(phn)
        else:
            text_list.append(syllable)
    return text_list


def replace_chouonpu(hiragana_text: str) -> str:
    """Expand the long-vowel mark 「ー」, which the upstream converters do not handle."""
    vowels = {
        "あ": "あ",
        "い": "い",
        "う": "う",
        "え": "え",
        "お": "う",
        "か": "あ",
        "き": "い",
        "く": "う",
        "け": "え",
        "こ": "う",
        "さ": "あ",
        "し": "い",
        "す": "う",
        "せ": "え",
        "そ": "う",
        "た": "あ",
        "ち": "い",
        "つ": "う",
        "て": "え",
        "と": "う",
        "な": "あ",
        "に": "い",
        "ぬ": "う",
        "ね": "え",
        "の": "う",
        "は": "あ",
        "ひ": "い",
        "ふ": "う",
        "へ": "え",
        "ほ": "う",
        "ま": "あ",
        "み": "い",
        "む": "う",
        "め": "え",
        "も": "う",
        "や": "あ",
        "ゆ": "う",
        "よ": "う",
        "ら": "あ",
        "り": "い",
        "る": "う",
        "れ": "え",
        "ろ": "う",
        "わ": "あ",
        "を": "う",
    }
    new_text = []
    for i, char in enumerate(hiragana_text):
        if char == "ー" and i > 0:
            prev_char = new_text[-1]
            if prev_char in yoon_map:
                prev_char = yoon_map[prev_char]
            new_text.append(vowels.get(prev_char, prev_char))
        else:
            new_text.append(char)
    return "".join(new_text)


def to_kana(text: str) -> list[str]:
    hiragana_text = kanji_to_kana.to_hiragana(text.replace(" ", ""))
    hiragana_text_wl = replace_chouonpu(hiragana_text).split(" ")
    final_ls = []
    for subword in hiragana_text_wl:
        sl_prev = 0
        for i in range(len(subword) - 1):
            if sl_prev >= len(subword) - 1:
                break
            sl = sl_prev + 1
            if subword[sl] in yoon_map:
                # Keep a yoon pair such as "きゃ" together as one unit.
                final_ls.append(subword[sl_prev : sl + 1])
                sl_prev += 2
            else:
                final_ls.append(subword[sl_prev])
                sl_prev += 1
        if sl_prev < len(subword):
            # The last character may already have been consumed as part of a yoon pair.
            final_ls.append(subword[sl_prev])
    return final_ls


def kana_to_phonemes_openjtalk(kana: str) -> list[str]:
    import pyopenjtalk

    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")
        # Add a space between each character so each kana is converted separately.
        kana = " ".join(list(kana))
        # `phones` is a space-separated string.
        phones = pyopenjtalk.g2p(kana, kana=False)
    if len(w) > 0:
        for warning in w:
            if "No phoneme" in str(warning.message):
                raise ValueError(f"No phoneme found for {kana}. {warning.message}")
    phones = phones.split(" ")
    return phones


def pinyin_to_phonemes_opencpop(pinyin: str) -> list[str]:
    pinyin = pinyin.lower()
    if pinyin in ace_phonemes_zh_plan["dict"]:
        phns = ace_phonemes_zh_plan["dict"][pinyin]
        return phns
    elif pinyin in ace_phonemes_zh_plan["syllable_alias"]:
        phns = ace_phonemes_zh_plan["dict"][
            ace_phonemes_zh_plan["syllable_alias"][pinyin]
        ]
        return phns
    else:
        raise ValueError(f"{pinyin} not registered in Opencpop phoneme dict")


def pinyin_to_phonemes_ace(pinyin: str) -> list[str]:
    pinyin = pinyin.lower()
    if pinyin in PINYIN_DICT:
        phns = PINYIN_DICT[pinyin]
        return phns
    else:
        raise ValueError(f"{pinyin} not registered in ACE phoneme dict")
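Taken together, `preprocess_text` splits a response into singable units (pinyin syllables for Mandarin, kana units for Japanese), which the `*_to_phonemes_*` helpers then map to phoneme tuples. A minimal sketch of that chain; the example strings and the commented outputs are illustrative assumptions, not fixtures from this repo:

```python
from modules.utils.g2p import preprocess_text, pinyin_to_phonemes_ace

# Mandarin: characters -> pinyin syllables -> (initial, final) phoneme tuples
syllables = preprocess_text("你好", "mandarin")            # e.g. ["ni", "hao"]
phonemes = [pinyin_to_phonemes_ace(s) for s in syllables]  # e.g. [("n", "i"), ("h", "ao")]

# Japanese: kanji/kana -> kana units; yoon pairs such as "きょ" stay together
kana_units = preprocess_text("東京", "japanese")           # e.g. ["と", "う", "きょ", "う"]
```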
{resource → modules/utils/resources}/all_plans.json
RENAMED
File without changes

{resource → modules/utils/resources}/pinyin_dict.py
RENAMED
File without changes

modules/utils/text_normalize.py
ADDED
@@ -0,0 +1,31 @@
import re
from typing import Optional


def remove_non_zh_jp(text: str) -> str:
    pattern = r"[^\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\u3000-\u303f\uff01-\uffef]"
    return re.sub(pattern, "", text)


def truncate_sentences(text: str, max_sentences: int) -> str:
    sentences = re.split(r"(?<=[。!?])", text)
    return "".join(sentences[:max_sentences]).strip()


def clean_llm_output(
    text: str,
    max_sentences: Optional[int] = 2,
    seg_syb: str = " ",
    language: str = "mandarin",
) -> str:
    if language not in ["mandarin", "japanese"]:
        raise NotImplementedError(f"Unsupported language: {language}")
    text = text.strip()
    if max_sentences is not None:
        text = truncate_sentences(text, max_sentences)
    text = remove_non_zh_jp(text)
    text = re.sub(r"[^\w\s\u4e00-\u9fff]", " ", text)  # Remove punctuation
    text = re.sub(r"\s+", " ", text)  # Normalize whitespace
    text = text.replace("\n", seg_syb)
    text = text.replace(" ", seg_syb)
    return text
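As a quick illustration of `clean_llm_output` with the default `max_sentences=2` (the sample string and the resulting text below are assumptions for demonstration):

```python
from modules.utils.text_normalize import clean_llm_output

raw = "你好!我是丽梅。Nice to meet you!今天想聊什么呢?"
cleaned = clean_llm_output(raw, language="mandarin")
# Truncates to the first two sentences, drops the non-CJK text and punctuation,
# and collapses whitespace: roughly "你好 我是丽梅".
```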
offline_process/create_features.py
DELETED
@@ -1,71 +0,0 @@
from datasets import load_dataset, concatenate_datasets

ds = load_dataset("espnet/ace-kising-segments", cache_dir="cache")

combined = concatenate_datasets([ds["train"], ds["validation"], ds["test"]])

# 2. filter rows by singer: barber
combined = combined.filter(lambda x: x["singer"] == "barber")

# 3. add columns counting the non-zero MIDI notes and the actual words per segment
combined = combined.map(
    lambda x: {
        "note_midi_length": len([n for n in x["note_midi"] if n != 0]),
        "lyric_word_length": len(
            [word for word in x["note_lyrics"] if word not in ["<AP>", "<SP>", "-"]]
        ),  # counts the number of actual words (or characters for, e.g., Chinese/Japanese)
    }
)
combined = combined.map(
    lambda x: {
        "lyric_word_length": len(
            [word for word in x["note_lyrics"] if word not in ["<AP>", "<SP>", "-"]]
        )
    }  # counts the number of actual words (or characters for, e.g., Chinese/Japanese)
)

# 4. sort by segment_id
combined = combined.sort("segment_id")

# 5. iterate over rows, collecting per-song lists of segment lengths
prev_songid = None
prev_song_segment_id = None
song2note_lengths = {}
song2word_lengths = {}
for row in combined:
    # segment_id: kising_barber_{songid}_{song_segment_id}
    _, _, songid, song_segment_id = row["segment_id"].split("_")
    if prev_songid != songid:
        if prev_songid is not None:
            assert (
                song_segment_id == "001"
            ), f"prev_songid: {prev_songid}, songid: {songid}, song_segment_id: {song_segment_id}"
        song2note_lengths[f"kising_{songid}"] = [row["note_midi_length"]]
        song2word_lengths[f"kising_{songid}"] = [row["lyric_word_length"]]
    else:
        assert (
            int(song_segment_id) >= int(prev_song_segment_id) + 1
        ), f"prev_song_segment_id: {prev_song_segment_id}, song_segment_id: {song_segment_id}"
        song2note_lengths[f"kising_{songid}"].append(row["note_midi_length"])
        song2word_lengths[f"kising_{songid}"].append(row["lyric_word_length"])
    prev_songid = songid
    prev_song_segment_id = song_segment_id

# 6. write to json
import json

with open("data/song2note_lengths.json", "w") as f:
    json.dump(song2note_lengths, f, indent=4)

with open("data/song2word_lengths.json", "w") as f:
    json.dump(song2word_lengths, f, indent=4)

# 7. push score segments to hub
# remove audio and singer columns
combined = combined.remove_columns(["audio", "singer"])
# replace kising_barber_ with kising_
combined = combined.map(
    lambda x: {"segment_id": x["segment_id"].replace("kising_barber_", "kising_")}
)
# upload to hub
combined.push_to_hub("jhansss/kising_score_segments")
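For reference, the JSON files this now-removed script produced (relocated in this refactor under `data/kising/`) map each song to a list of per-segment lengths; the song IDs and counts below are illustrative, not real values:

```python
# Shape of data/kising/song2note_lengths.json (values illustrative):
song2note_lengths = {
    "kising_001": [12, 9, 14],  # non-rest note count of each segment of song 001
    "kising_002": [11, 8, 13],
}
```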
path.sh
DELETED
@@ -1,3 +0,0 @@
#!/bin/bash

. ~/workspace/SingingSDS/activate_python.sh
pipeline.py
ADDED
@@ -0,0 +1,103 @@
import time

import librosa
import torch

from modules.asr import get_asr_model
from modules.llm import get_llm_model
from modules.svs import get_svs_model
from evaluation.svs_eval import load_evaluators, run_evaluation
from modules.melody import MelodyController
from modules.utils.text_normalize import clean_llm_output


class SingingDialoguePipeline:
    def __init__(self, config: dict):
        if "device" in config:
            self.device = config["device"]
        else:
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.cache_dir = config["cache_dir"]
        self.asr = get_asr_model(
            config["asr_model"], device=self.device, cache_dir=self.cache_dir
        )
        self.llm = get_llm_model(
            config["llm_model"], device=self.device, cache_dir=self.cache_dir
        )
        self.svs = get_svs_model(
            config["svs_model"], device=self.device, cache_dir=self.cache_dir
        )
        self.melody_controller = MelodyController(
            config["melody_source"], self.cache_dir
        )
        self.track_latency = config.get("track_latency", False)
        self.evaluators = load_evaluators(config.get("evaluators", {}).get("svs", []))

    def set_asr_model(self, asr_model: str):
        self.asr = get_asr_model(asr_model, device=self.device, cache_dir=self.cache_dir)

    def set_llm_model(self, llm_model: str):
        self.llm = get_llm_model(llm_model, device=self.device, cache_dir=self.cache_dir)

    def set_svs_model(self, svs_model: str):
        self.svs = get_svs_model(svs_model, device=self.device, cache_dir=self.cache_dir)

    def set_melody_controller(self, melody_source: str):
        self.melody_controller = MelodyController(melody_source, self.cache_dir)

    def run(
        self,
        audio_path,
        language,
        prompt_template,
        svs_inference_kwargs,
        max_new_tokens=100,
    ):
        if self.track_latency:
            asr_start_time = time.time()
        audio_array, audio_sample_rate = librosa.load(audio_path, sr=16000)
        asr_result = self.asr.transcribe(
            audio_array, audio_sample_rate=audio_sample_rate, language=language
        )
        if self.track_latency:
            asr_latency = time.time() - asr_start_time
        melody_prompt = self.melody_controller.get_melody_constraints()
        prompt = prompt_template.format(melody_prompt, asr_result)
        if self.track_latency:
            llm_start_time = time.time()
        output = self.llm.generate(prompt, max_new_tokens=max_new_tokens)
        if self.track_latency:
            llm_latency = time.time() - llm_start_time
        print(f"llm output: {output}")  # TODO: verify the output does not echo the prompt
        llm_response = clean_llm_output(output, language=language)
        score = self.melody_controller.generate_score(llm_response, language)
        if self.track_latency:
            svs_start_time = time.time()
        singing_audio, sample_rate = self.svs.synthesize(
            score, language=language, **svs_inference_kwargs
        )
        if self.track_latency:
            svs_latency = time.time() - svs_start_time
        results = {
            "asr_text": asr_result,
            "llm_text": llm_response,
            "svs_audio": (singing_audio, sample_rate),
        }
        if self.track_latency:
            # Store latencies under a fresh "metrics" key (the dict above does not define one).
            results["metrics"] = {
                "asr_latency": asr_latency,
                "llm_latency": llm_latency,
                "svs_latency": svs_latency,
            }
        return results

    def evaluate(self, audio, sample_rate):
        return run_evaluation(audio, sample_rate, self.evaluators)
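For orientation, a minimal sketch of driving the new pipeline end to end. The config keys mirror what `__init__` reads; the model IDs, melody source, input file, and prompt template below are placeholders, not values from this diff:

```python
from pipeline import SingingDialoguePipeline

config = {
    "cache_dir": "cache",
    "asr_model": "<asr-model-id>",       # placeholder
    "llm_model": "<llm-model-id>",       # placeholder
    "svs_model": "<svs-model-id>",       # placeholder
    "melody_source": "<melody-source>",  # placeholder
    "track_latency": True,
    "evaluators": {"svs": []},
}

pipeline = SingingDialoguePipeline(config)
results = pipeline.run(
    audio_path="user_turn.wav",           # hypothetical recording of the user's turn
    language="mandarin",
    prompt_template="{}\nUser said: {}",  # two positional slots: melody prompt, ASR text
    svs_inference_kwargs={},
)
singing_audio, sample_rate = results["svs_audio"]
```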
{resource → resources}/__init__.py
RENAMED
File without changes

resources/all_plans.json
ADDED
The diff for this file is too large to render. See raw diff.

{resource → resources}/midi-note.scp
RENAMED
File without changes

resources/pinyin_dict.py
ADDED
@@ -0,0 +1,423 @@
# Adapted from Opencpop's pinyin to phoneme mapping table:
# https://wenet.org.cn/opencpop/resources/annotationformat/
PINYIN_DICT = {
    "a": ("a",),
    "ai": ("ai",),
    "an": ("an",),
    "ang": ("ang",),
    "ao": ("ao",),
    "ba": ("b", "a"),
    "bai": ("b", "ai"),
    "ban": ("b", "an"),
    "bang": ("b", "ang"),
    "bao": ("b", "ao"),
    "bei": ("b", "ei"),
    "ben": ("b", "en"),
    "beng": ("b", "eng"),
    "bi": ("b", "i"),
    "bian": ("b", "ian"),
    "biao": ("b", "iao"),
    "bie": ("b", "ie"),
    "bin": ("b", "in"),
    "bing": ("b", "ing"),
    "bo": ("b", "o"),
    "bu": ("b", "u"),
    "ca": ("c", "a"),
    "cai": ("c", "ai"),
    "can": ("c", "an"),
    "cang": ("c", "ang"),
    "cao": ("c", "ao"),
    "ce": ("c", "e"),
    "cei": ("c", "ei"),
    "cen": ("c", "en"),
    "ceng": ("c", "eng"),
    "cha": ("ch", "a"),
    "chai": ("ch", "ai"),
    "chan": ("ch", "an"),
    "chang": ("ch", "ang"),
    "chao": ("ch", "ao"),
    "che": ("ch", "e"),
    "chen": ("ch", "en"),
    "cheng": ("ch", "eng"),
    "chi": ("ch", "i"),
    "chong": ("ch", "ong"),
    "chou": ("ch", "ou"),
    "chu": ("ch", "u"),
    "chua": ("ch", "ua"),
    "chuai": ("ch", "uai"),
    "chuan": ("ch", "uan"),
    "chuang": ("ch", "uang"),
    "chui": ("ch", "ui"),
    "chun": ("ch", "un"),
    "chuo": ("ch", "uo"),
    "ci": ("c", "i"),
    "cong": ("c", "ong"),
    "cou": ("c", "ou"),
    "cu": ("c", "u"),
    "cuan": ("c", "uan"),
    "cui": ("c", "ui"),
    "cun": ("c", "un"),
    "cuo": ("c", "uo"),
    "da": ("d", "a"),
    "dai": ("d", "ai"),
    "dan": ("d", "an"),
    "dang": ("d", "ang"),
    "dao": ("d", "ao"),
    "de": ("d", "e"),
    "dei": ("d", "ei"),
    "den": ("d", "en"),
    "deng": ("d", "eng"),
    "di": ("d", "i"),
    "dia": ("d", "ia"),
    "dian": ("d", "ian"),
    "diao": ("d", "iao"),
    "die": ("d", "ie"),
    "ding": ("d", "ing"),
    "diu": ("d", "iu"),
    "dong": ("d", "ong"),
    "dou": ("d", "ou"),
    "du": ("d", "u"),
    "duan": ("d", "uan"),
    "dui": ("d", "ui"),
    "dun": ("d", "un"),
    "duo": ("d", "uo"),
    "e": ("e",),
    "ei": ("ei",),
    "en": ("en",),
    "eng": ("eng",),
    "er": ("er",),
    "fa": ("f", "a"),
    "fan": ("f", "an"),
    "fang": ("f", "ang"),
    "fei": ("f", "ei"),
    "fen": ("f", "en"),
    "feng": ("f", "eng"),
    "fo": ("f", "o"),
    "fou": ("f", "ou"),
    "fu": ("f", "u"),
    "ga": ("g", "a"),
    "gai": ("g", "ai"),
    "gan": ("g", "an"),
    "gang": ("g", "ang"),
    "gao": ("g", "ao"),
    "ge": ("g", "e"),
    "gei": ("g", "ei"),
    "gen": ("g", "en"),
    "geng": ("g", "eng"),
    "gong": ("g", "ong"),
    "gou": ("g", "ou"),
    "gu": ("g", "u"),
    "gua": ("g", "ua"),
    "guai": ("g", "uai"),
    "guan": ("g", "uan"),
    "guang": ("g", "uang"),
    "gui": ("g", "ui"),
    "gun": ("g", "un"),
    "guo": ("g", "uo"),
    "ha": ("h", "a"),
    "hai": ("h", "ai"),
    "han": ("h", "an"),
    "hang": ("h", "ang"),
    "hao": ("h", "ao"),
    "he": ("h", "e"),
    "hei": ("h", "ei"),
    "hen": ("h", "en"),
    "heng": ("h", "eng"),
    "hm": ("h", "m"),
    "hng": ("h", "ng"),
    "hong": ("h", "ong"),
    "hou": ("h", "ou"),
    "hu": ("h", "u"),
    "hua": ("h", "ua"),
    "huai": ("h", "uai"),
    "huan": ("h", "uan"),
    "huang": ("h", "uang"),
    "hui": ("h", "ui"),
    "hun": ("h", "un"),
    "huo": ("h", "uo"),
    "ji": ("j", "i"),
    "jia": ("j", "ia"),
    "jian": ("j", "ian"),
    "jiang": ("j", "iang"),
    "jiao": ("j", "iao"),
    "jie": ("j", "ie"),
    "jin": ("j", "in"),
    "jing": ("j", "ing"),
    "jiong": ("j", "iong"),
    "jiu": ("j", "iu"),
    "ju": ("j", "v"),
    "juan": ("j", "van"),
    "jue": ("j", "ve"),
    "jun": ("j", "vn"),
    "ka": ("k", "a"),
    "kai": ("k", "ai"),
    "kan": ("k", "an"),
    "kang": ("k", "ang"),
    "kao": ("k", "ao"),
    "ke": ("k", "e"),
    "kei": ("k", "ei"),
    "ken": ("k", "en"),
    "keng": ("k", "eng"),
    "kong": ("k", "ong"),
    "kou": ("k", "ou"),
    "ku": ("k", "u"),
    "kua": ("k", "ua"),
    "kuai": ("k", "uai"),
    "kuan": ("k", "uan"),
    "kuang": ("k", "uang"),
    "kui": ("k", "ui"),
    "kun": ("k", "un"),
    "kuo": ("k", "uo"),
    "la": ("l", "a"),
    "lai": ("l", "ai"),
    "lan": ("l", "an"),
    "lang": ("l", "ang"),
    "lao": ("l", "ao"),
    "le": ("l", "e"),
    "lei": ("l", "ei"),
    "leng": ("l", "eng"),
    "li": ("l", "i"),
    "lia": ("l", "ia"),
    "lian": ("l", "ian"),
    "liang": ("l", "iang"),
    "liao": ("l", "iao"),
    "lie": ("l", "ie"),
    "lin": ("l", "in"),
    "ling": ("l", "ing"),
    "liu": ("l", "iu"),
    "lo": ("l", "o"),
    "long": ("l", "ong"),
    "lou": ("l", "ou"),
    "lu": ("l", "u"),
    "luan": ("l", "uan"),
    "lun": ("l", "un"),
    "luo": ("l", "uo"),
    "lv": ("l", "v"),
    "lve": ("l", "ve"),
    "m": ("m",),
    "ma": ("m", "a"),
    "mai": ("m", "ai"),
    "man": ("m", "an"),
    "mang": ("m", "ang"),
    "mao": ("m", "ao"),
    "me": ("m", "e"),
    "mei": ("m", "ei"),
    "men": ("m", "en"),
    "meng": ("m", "eng"),
    "mi": ("m", "i"),
    "mian": ("m", "ian"),
    "miao": ("m", "iao"),
    "mie": ("m", "ie"),
    "min": ("m", "in"),
    "ming": ("m", "ing"),
    "miu": ("m", "iu"),
    "mo": ("m", "o"),
    "mou": ("m", "ou"),
    "mu": ("m", "u"),
    "n": ("n",),
    "na": ("n", "a"),
    "nai": ("n", "ai"),
    "nan": ("n", "an"),
    "nang": ("n", "ang"),
    "nao": ("n", "ao"),
    "ne": ("n", "e"),
    "nei": ("n", "ei"),
    "nen": ("n", "en"),
    "neng": ("n", "eng"),
    "ng": ("n", "g"),
    "ni": ("n", "i"),
    "nian": ("n", "ian"),
    "niang": ("n", "iang"),
    "niao": ("n", "iao"),
    "nie": ("n", "ie"),
    "nin": ("n", "in"),
    "ning": ("n", "ing"),
    "niu": ("n", "iu"),
    "nong": ("n", "ong"),
    "nou": ("n", "ou"),
    "nu": ("n", "u"),
    "nuan": ("n", "uan"),
    "nun": ("n", "un"),
    "nuo": ("n", "uo"),
    "nv": ("n", "v"),
    "nve": ("n", "ve"),
    "o": ("o",),
    "ou": ("ou",),
    "pa": ("p", "a"),
    "pai": ("p", "ai"),
    "pan": ("p", "an"),
    "pang": ("p", "ang"),
    "pao": ("p", "ao"),
    "pei": ("p", "ei"),
    "pen": ("p", "en"),
    "peng": ("p", "eng"),
    "pi": ("p", "i"),
    "pian": ("p", "ian"),
    "piao": ("p", "iao"),
    "pie": ("p", "ie"),
    "pin": ("p", "in"),
    "ping": ("p", "ing"),
    "po": ("p", "o"),
    "pou": ("p", "ou"),
    "pu": ("p", "u"),
    "qi": ("q", "i"),
    "qia": ("q", "ia"),
    "qian": ("q", "ian"),
    "qiang": ("q", "iang"),
    "qiao": ("q", "iao"),
    "qie": ("q", "ie"),
    "qin": ("q", "in"),
    "qing": ("q", "ing"),
    "qiong": ("q", "iong"),
    "qiu": ("q", "iu"),
    "qu": ("q", "v"),
    "quan": ("q", "van"),
    "que": ("q", "ve"),
    "qun": ("q", "vn"),
    "ran": ("r", "an"),
    "rang": ("r", "ang"),
    "rao": ("r", "ao"),
    "re": ("r", "e"),
    "ren": ("r", "en"),
    "reng": ("r", "eng"),
    "ri": ("r", "i"),
    "rong": ("r", "ong"),
    "rou": ("r", "ou"),
    "ru": ("r", "u"),
    "rua": ("r", "ua"),
    "ruan": ("r", "uan"),
    "rui": ("r", "ui"),
    "run": ("r", "un"),
    "ruo": ("r", "uo"),
    "sa": ("s", "a"),
    "sai": ("s", "ai"),
    "san": ("s", "an"),
    "sang": ("s", "ang"),
    "sao": ("s", "ao"),
    "se": ("s", "e"),
    "sen": ("s", "en"),
    "seng": ("s", "eng"),
    "sha": ("sh", "a"),
    "shai": ("sh", "ai"),
    "shan": ("sh", "an"),
    "shang": ("sh", "ang"),
    "shao": ("sh", "ao"),
    "she": ("sh", "e"),
    "shei": ("sh", "ei"),
    "shen": ("sh", "en"),
    "sheng": ("sh", "eng"),
    "shi": ("sh", "i"),
    "shou": ("sh", "ou"),
    "shu": ("sh", "u"),
    "shua": ("sh", "ua"),
    "shuai": ("sh", "uai"),
    "shuan": ("sh", "uan"),
    "shuang": ("sh", "uang"),
    "shui": ("sh", "ui"),
    "shun": ("sh", "un"),
    "shuo": ("sh", "uo"),
    "si": ("s", "i"),
    "song": ("s", "ong"),
    "sou": ("s", "ou"),
    "su": ("s", "u"),
    "suan": ("s", "uan"),
    "sui": ("s", "ui"),
    "sun": ("s", "un"),
    "suo": ("s", "uo"),
    "ta": ("t", "a"),
    "tai": ("t", "ai"),
    "tan": ("t", "an"),
    "tang": ("t", "ang"),
    "tao": ("t", "ao"),
    "te": ("t", "e"),
    "tei": ("t", "ei"),
    "teng": ("t", "eng"),
    "ti": ("t", "i"),
    "tian": ("t", "ian"),
    "tiao": ("t", "iao"),
    "tie": ("t", "ie"),
    "ting": ("t", "ing"),
    "tong": ("t", "ong"),
    "tou": ("t", "ou"),
    "tu": ("t", "u"),
    "tuan": ("t", "uan"),
    "tui": ("t", "ui"),
    "tun": ("t", "un"),
    "tuo": ("t", "uo"),
    "wa": ("w", "a"),
    "wai": ("w", "ai"),
    "wan": ("w", "an"),
    "wang": ("w", "ang"),
    "wei": ("w", "ei"),
    "wen": ("w", "en"),
    "weng": ("w", "eng"),
    "wo": ("w", "o"),
    "wu": ("w", "u"),
    "xi": ("x", "i"),
    "xia": ("x", "ia"),
    "xian": ("x", "ian"),
    "xiang": ("x", "iang"),
    "xiao": ("x", "iao"),
    "xie": ("x", "ie"),
    "xin": ("x", "in"),
    "xing": ("x", "ing"),
    "xiong": ("x", "iong"),
    "xiu": ("x", "iu"),
    "xu": ("x", "v"),
    "xuan": ("x", "van"),
    "xue": ("x", "ve"),
    "xun": ("x", "vn"),
    "ya": ("y", "a"),
    "yan": ("y", "an"),
    "yang": ("y", "ang"),
    "yao": ("y", "ao"),
    "ye": ("y", "e"),
    "yi": ("y", "i"),
    "yin": ("y", "in"),
    "ying": ("y", "ing"),
    "yo": ("y", "o"),
    "yong": ("y", "ong"),
    "you": ("y", "ou"),
    "yu": ("y", "v"),
    "yuan": ("y", "van"),
    "yue": ("y", "ve"),
    "yun": ("y", "vn"),
    "za": ("z", "a"),
    "zai": ("z", "ai"),
    "zan": ("z", "an"),
    "zang": ("z", "ang"),
    "zao": ("z", "ao"),
    "ze": ("z", "e"),
    "zei": ("z", "ei"),
    "zen": ("z", "en"),
    "zeng": ("z", "eng"),
    "zha": ("zh", "a"),
    "zhai": ("zh", "ai"),
    "zhan": ("zh", "an"),
    "zhang": ("zh", "ang"),
    "zhao": ("zh", "ao"),
    "zhe": ("zh", "e"),
    "zhei": ("zh", "ei"),
    "zhen": ("zh", "en"),
    "zheng": ("zh", "eng"),
    "zhi": ("zh", "i"),
    "zhong": ("zh", "ong"),
    "zhou": ("zh", "ou"),
    "zhu": ("zh", "u"),
    "zhua": ("zh", "ua"),
    "zhuai": ("zh", "uai"),
    "zhuan": ("zh", "uan"),
    "zhuang": ("zh", "uang"),
    "zhui": ("zh", "ui"),
    "zhun": ("zh", "un"),
    "zhuo": ("zh", "uo"),
    "zi": ("z", "i"),
    "zong": ("z", "ong"),
    "zou": ("z", "ou"),
    "zu": ("z", "u"),
    "zuan": ("z", "uan"),
    "zui": ("z", "ui"),
    "zun": ("z", "un"),
    "zuo": ("z", "uo"),
}
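The table is keyed by toneless pinyin syllables; each value is the (initial, final) split that `pinyin_to_phonemes_ace` returns. A small lookup sketch:

```python
from resources.pinyin_dict import PINYIN_DICT

print(PINYIN_DICT["zhuang"])  # ("zh", "uang")
print(PINYIN_DICT["a"])       # ("a",)  zero-initial syllables map to a 1-tuple
print(PINYIN_DICT["lv"])      # ("l", "v")  "v" stands in for the umlaut vowel ü
```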
{resource → resources}/singer/singer_embedding_ace-1.npy
RENAMED
File without changes

{resource → resources}/singer/singer_embedding_ace-10.npy
RENAMED
File without changes

{resource → resources}/singer/singer_embedding_ace-11.npy
RENAMED
File without changes

{resource → resources}/singer/singer_embedding_ace-12.npy
RENAMED
File without changes

{resource → resources}/singer/singer_embedding_ace-13.npy
RENAMED
File without changes

{resource → resources}/singer/singer_embedding_ace-14.npy
RENAMED
File without changes

{resource → resources}/singer/singer_embedding_ace-15.npy
RENAMED
File without changes

{resource → resources}/singer/singer_embedding_ace-16.npy
RENAMED
File without changes

{resource → resources}/singer/singer_embedding_ace-17.npy
RENAMED
File without changes

{resource → resources}/singer/singer_embedding_ace-18.npy
RENAMED
File without changes

{resource → resources}/singer/singer_embedding_ace-19.npy
RENAMED
File without changes