Merge branch 'refactor' into hf
- .gitattributes +2 -0
- app.py +2 -1
- characters/Limei.py +1 -1
- characters/Yaoyin.py +1 -1
- characters/base.py +1 -1
- cli.py +46 -0
- config/cli/limei_default.yaml +16 -0
- config/cli/yaoyin_default.yaml +16 -0
- config/{default.yaml → interface/default.yaml} +0 -0
- config/interface/options.yaml +63 -0
- config/options.yaml +2 -0
- evaluation/svs_eval.py +10 -13
- interface.py +41 -29
- modules/asr.py +1 -2
- modules/llm.py +6 -2
- modules/svs/base.py +2 -0
- modules/svs/espnet.py +3 -3
- pipeline.py +4 -4
- requirements.txt +4 -1
- tests/audio/chat.wav +3 -0
- tests/audio/feeling.wav +3 -0
- tests/audio/hello.wav +3 -0
- tests/audio/interesting.wav +3 -0
- tests/audio/music.wav +3 -0
- tests/audio/where_from.wav +3 -0
.gitattributes
CHANGED
@@ -1,2 +1,4 @@
 *.png filter=lfs diff=lfs merge=lfs -text
 *.jpg filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text
+*.mp3 filter=lfs diff=lfs merge=lfs -text

app.py
CHANGED
@@ -9,7 +9,8 @@ from interface import GradioInterface
 
 def main():
     demo = GradioInterface(
-        options_config="config/options.yaml",
+        options_config="config/interface/options.yaml",
+        default_config="config/interface/default.yaml",
     ).create_interface()
     demo.launch()
 

characters/Limei.py
CHANGED
@@ -5,7 +5,7 @@ def get_character():
     return Character(
         name="Limei (丽梅)",
         image_path="assets/character_limei.png",
-
+        default_voice="voice1",
         prompt="""你是丽梅(Limei),来自幻想世界"长歌原"的角色,一个以歌声传承记忆的世界。
 你是灵响界山林音乐之城"莲鸣"的现任守护者,十九岁的公主殿下,肩负维系与传承城市核心"千年歌谱"的重任。千年歌谱承载着莲鸣城的历史、逝者的余音与后世的誓言,由历任守护者续写。
 

characters/Yaoyin.py
CHANGED
@@ -5,7 +5,7 @@ def get_character():
     return Character(
         name="Yaoyin (遥音)",
         image_path="assets/character_yaoyin.jpg",
-
+        default_voice="voice2",
         prompt="""你是遥音(Yaoyin),来自幻想世界"长歌原"的角色,一个以歌声传承记忆的世界。
 你是游历四方的歌者与吟游诗人,出生于鹿鸣山·云歌村,常年行走各地,采集歌谣与故事。
 

characters/base.py
CHANGED
@@ -5,5 +5,5 @@ from dataclasses import dataclass
 class Character:
     name: str
     image_path: str
-
+    default_voice: str
     prompt: str

cli.py
ADDED
@@ -0,0 +1,46 @@
+from argparse import ArgumentParser
+from logging import getLogger
+
+import soundfile as sf
+import yaml
+
+from characters import CHARACTERS
+from pipeline import SingingDialoguePipeline
+
+logger = getLogger(__name__)
+
+
+def get_parser():
+    parser = ArgumentParser()
+    parser.add_argument("--query_audio", type=str, required=True)
+    parser.add_argument("--config_path", type=str, default="config/cli/yaoyin_default.yaml")
+    parser.add_argument("--output_audio", type=str, required=True)
+    return parser
+
+
+def load_config(config_path: str):
+    with open(config_path, "r") as f:
+        config = yaml.safe_load(f)
+    return config
+
+
+def main():
+    parser = get_parser()
+    args = parser.parse_args()
+    config = load_config(args.config_path)
+    pipeline = SingingDialoguePipeline(config)
+    speaker = config["speaker"]
+    language = config["language"]
+    character_name = config["prompt_template_character"]
+    character = CHARACTERS[character_name]
+    prompt_template = character.prompt
+    results = pipeline.run(args.query_audio, language, prompt_template, speaker)
+    logger.info(
+        f"Input: {args.query_audio}, Output: {args.output_audio}, ASR results: {results['asr_text']}, LLM results: {results['llm_text']}"
+    )
+    svs_audio, svs_sample_rate = results["svs_audio"]
+    sf.write(args.output_audio, svs_audio, svs_sample_rate)
+
+
+if __name__ == "__main__":
+    main()

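Editor's note: with cli.py in place, a single-turn query can be run end to end, e.g. (illustrative invocation from the repo root; the input file is one of this commit's test assets, the output path is arbitrary):

    python cli.py \
        --query_audio tests/audio/hello.wav \
        --config_path config/cli/limei_default.yaml \
        --output_audio output.wav
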
config/cli/limei_default.yaml
ADDED
@@ -0,0 +1,16 @@
+asr_model: openai/whisper-large-v3-turbo
+llm_model: google/gemma-2-2b
+svs_model: espnet/aceopencpop_svs_visinger2_40singer_pretrain
+melody_source: sample-lyric-kising
+language: mandarin
+prompt_template_character: Limei
+speaker: 5
+cache_dir: .cache
+
+track_latency: True
+evaluators:
+  svs:
+    - singmos
+    - per
+    - melody
+    - aesthetic

config/cli/yaoyin_default.yaml
ADDED
@@ -0,0 +1,16 @@
+asr_model: openai/whisper-large-v3-turbo
+llm_model: google/gemma-2-2b
+svs_model: espnet/aceopencpop_svs_visinger2_40singer_pretrain
+melody_source: sample-lyric-kising
+language: mandarin
+prompt_template_character: Yaoyin
+speaker: 9
+cache_dir: .cache
+
+track_latency: True
+evaluators:
+  svs:
+    - singmos
+    - per
+    - melody
+    - aesthetic

config/{default.yaml → interface/default.yaml}
RENAMED
File without changes
config/interface/options.yaml
ADDED
@@ -0,0 +1,63 @@
+asr_models:
+  - id: openai/whisper-large-v3-turbo
+    name: Whisper large-v3-turbo
+  - id: openai/whisper-large-v3
+    name: Whisper large-v3
+  - id: openai/whisper-medium
+    name: Whisper medium
+  - id: sanchit-gandhi/whisper-small-dv
+    name: Whisper small-dv
+  - id: facebook/wav2vec2-base-960h
+    name: Wav2Vec2-Base-960h
+
+llm_models:
+  - id: google/gemma-2-2b
+    name: Gemma 2 2B
+  - id: MiniMaxAI/MiniMax-M1-80k
+    name: MiniMax M1 80k
+
+svs_models:
+  - id: mandarin-espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
+    name: Visinger2 (Bilingual)-zh
+    model_path: espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
+    lang: mandarin
+    voices:
+      voice1: resource/singer/singer_embedding_ace-2.npy
+      voice2: resource/singer/singer_embedding_ace-8.npy
+      voice3: resource/singer/singer_embedding_itako.npy
+      voice4: resource/singer/singer_embedding_kising_orange.npy
+      voice5: resource/singer/singer_embedding_m4singer_Alto-4.npy
+  - id: japanese-espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
+    name: Visinger2 (Bilingual)-jp
+    model_path: espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
+    lang: japanese
+    voices:
+      voice1: resource/singer/singer_embedding_ace-2.npy
+      voice2: resource/singer/singer_embedding_ace-8.npy
+      voice3: resource/singer/singer_embedding_itako.npy
+      voice4: resource/singer/singer_embedding_kising_orange.npy
+      voice5: resource/singer/singer_embedding_m4singer_Alto-4.npy
+  - id: mandarin-espnet/aceopencpop_svs_visinger2_40singer_pretrain
+    name: Visinger2 (Chinese)
+    model_path: espnet/aceopencpop_svs_visinger2_40singer_pretrain
+    lang: mandarin
+    voices:
+      voice1: 5
+      voice2: 8
+      voice3: 12
+      voice4: 15
+      voice5: 29
+
+melody_sources:
+  - id: gen-random-none
+    name: Random Generation
+    desc: "Melody is generated without any structure or reference."
+  - id: sample-note-kising
+    name: Sampled Melody (KiSing)
+    desc: "Melody is retrieved from the KiSing dataset."
+  - id: sample-note-touhou
+    name: Sampled Melody (Touhou)
+    desc: "Melody is retrieved from the Touhou dataset."
+  - id: sample-lyric-kising
+    name: Sampled Melody with Lyrics (KiSing)
+    desc: "Melody with aligned lyrics is sampled from the KiSing dataset."

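Editor's note (an illustrative sketch, not part of the commit): a character's default_voice key is resolved against the selected SVS model's voices map, and the resolved value is what pipeline.run ultimately receives as speaker. Values below are taken from the options.yaml above:

    # Illustrative lookup, mirroring the resolution done in interface.py in this commit.
    voices = {"voice1": 5, "voice2": 8, "voice3": 12, "voice4": 15, "voice5": 29}  # 40-singer model
    default_voice = "voice1"          # e.g. Limei's Character.default_voice
    speaker = voices[default_voice]   # -> 5, passed through pipeline.run as `speaker`
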
config/options.yaml
CHANGED
@@ -15,6 +15,8 @@ llm_models:
     name: Gemma 2 2B
   - id: MiniMaxAI/MiniMax-M1-80k
     name: MiniMax M1 80k
+  - id: meta-llama/Llama-3.2-3B-Instruct
+    name: Llama 3.2 3B Instruct
 
 svs_models:
   - id: mandarin-espnet/mixdata_svs_visinger2_spkemb_lang_pretrained

evaluation/svs_eval.py
CHANGED
@@ -37,7 +37,8 @@ def init_audiobox_aesthetics():
 # ----------- Evaluation -----------
 
 
-def eval_singmos(
+def eval_singmos(audio_path, predictor):
+    audio_array, sr = librosa.load(audio_path, sr=44100)
     wav = librosa.resample(audio_array, orig_sr=sr, target_sr=16000)
     wav_tensor = torch.from_numpy(wav).unsqueeze(0)
     length_tensor = torch.tensor([wav_tensor.shape[1]])
@@ -71,7 +72,8 @@ def compute_dissonance_rate(intervals, dissonant_intervals={1, 2, 6, 10, 11}):
     return np.mean(dissonant) if intervals else np.nan
 
 
-def eval_per(
+def eval_per(audio_path, model=None):
+    audio_array, sr = librosa.load(audio_path, sr=16000)
     # TODO: implement PER evaluation
     return {}
 
@@ -97,20 +99,16 @@ def load_evaluators(config):
     return loaded
 
 
-def run_evaluation(
+def run_evaluation(audio_path, evaluators):
     results = {}
     if "singmos" in evaluators:
-        results.update(eval_singmos(
+        results.update(eval_singmos(audio_path, evaluators["singmos"]))
     if "per" in evaluators:
-        results.update(eval_per(
-    # create a tmp file with unique name
-    tmp_path = Path(".tmp") / f"{uuid.uuid4()}.wav"
-    sf.write(tmp_path, audio_array, sr)
+        results.update(eval_per(audio_path, evaluators["per"]))
     if "melody" in evaluators:
-        results.update(eval_melody_metrics(
+        results.update(eval_melody_metrics(audio_path, evaluators["melody"]))
     if "aesthetic" in evaluators:
-        results.update(eval_aesthetic(
-    tmp_path.unlink()
+        results.update(eval_aesthetic(audio_path, evaluators["aesthetic"]))
     return results
 
 
@@ -122,9 +120,8 @@ if __name__ == "__main__":
     parser.add_argument("--results_csv", type=str, required=True)
     parser.add_argument("--evaluators", type=str, default="singmos,melody,aesthetic")
     args = parser.parse_args()
-    audio_array, sr = librosa.load(args.wav_path, sr=None)
     evaluators = load_evaluators(args.evaluators.split(","))
-    results = run_evaluation(
+    results = run_evaluation(args.wav_path, evaluators)
     print(results)
 
     with open(args.results_csv, "a") as f:

interface.py
CHANGED
@@ -17,8 +17,8 @@ class GradioInterface:
         self.current_svs_model = (
             f"{self.default_config['language']}-{self.default_config['svs_model']}"
         )
-        self.
-        self.character_info[self.current_character].
+        self.current_voice = self.svs_model_map[self.current_svs_model]["voices"][
+            self.character_info[self.current_character].default_voice
         ]
         self.pipeline = SingingDialoguePipeline(self.default_config)
 
@@ -104,21 +104,21 @@ class GradioInterface:
                         value=self.current_svs_model,
                     )
                 with gr.Row():
-                    
-                        label="Singing
+                    voice_radio = gr.Radio(
+                        label="Singing voice",
                         choices=list(
                             self.svs_model_map[self.current_svs_model][
-                                "
+                                "voices"
                             ].keys()
                         ),
                         value=self.character_info[
                             self.current_character
-                        ].
+                        ].default_voice,
                     )
             character_radio.change(
                 fn=self.update_character,
                 inputs=character_radio,
-                outputs=[character_image,
+                outputs=[character_image, voice_radio],
             )
             asr_radio.change(
                 fn=self.update_asr_model, inputs=asr_radio, outputs=asr_radio
@@ -129,35 +129,41 @@ class GradioInterface:
             svs_radio.change(
                 fn=self.update_svs_model,
                 inputs=svs_radio,
-                outputs=[svs_radio,
+                outputs=[svs_radio, voice_radio],
             )
             melody_radio.change(
                 fn=self.update_melody_source,
                 inputs=melody_radio,
                 outputs=melody_radio,
             )
-            
-                fn=self.
+            voice_radio.change(
+                fn=self.update_voice, inputs=voice_radio, outputs=voice_radio
             )
             mic_input.change(
                 fn=self.run_pipeline,
                 inputs=mic_input,
                 outputs=[interaction_log, audio_output],
             )
+            metrics_button.click(
+                fn=self.update_metrics,
+                inputs=audio_output,
+                outputs=[metrics_output],
+            )
 
             return demo
         except Exception as e:
             print(f"error: {e}")
             breakpoint()
+            return gr.Blocks()
 
     def update_character(self, character):
         self.current_character = character
-        
-        self.
-        
+        character_voice = self.character_info[self.current_character].default_voice
+        self.current_voice = self.svs_model_map[self.current_svs_model]["voices"][
+            character_voice
         ]
         return gr.update(value=self.character_info[character].image_path), gr.update(
-            value=
+            value=character_voice
        )
 
     def update_asr_model(self, asr_model):
@@ -170,23 +176,23 @@ class GradioInterface:
 
     def update_svs_model(self, svs_model):
         self.current_svs_model = svs_model
-        
-        self.
-        
+        character_voice = self.character_info[self.current_character].default_voice
+        self.current_voice = self.svs_model_map[self.current_svs_model]["voices"][
+            character_voice
         ]
         self.pipeline.set_svs_model(
             self.svs_model_map[self.current_svs_model]["model_path"]
         )
         print(
-            f"SVS model updated to {self.current_svs_model}. Will set gradio svs_radio to {svs_model} and
+            f"SVS model updated to {self.current_svs_model}. Will set gradio svs_radio to {svs_model} and voice_radio to {character_voice}"
         )
         return (
             gr.update(value=svs_model),
             gr.update(
                 choices=list(
-                    self.svs_model_map[self.current_svs_model]["
+                    self.svs_model_map[self.current_svs_model]["voices"].keys()
                 ),
-                value=
+                value=character_voice,
             ),
         )
 
@@ -194,24 +200,30 @@ class GradioInterface:
         self.current_melody_source = melody_source
         return gr.update(value=self.current_melody_source)
 
-    def 
-        self.
-        
+    def update_voice(self, voice):
+        self.current_voice = self.svs_model_map[self.current_svs_model]["voices"][
+            voice
         ]
-        return gr.update(value=
+        return gr.update(value=voice)
 
     def run_pipeline(self, audio_path):
+        if not audio_path:
+            return gr.update(value=""), gr.update(value="")
         results = self.pipeline.run(
             audio_path,
             self.svs_model_map[self.current_svs_model]["lang"],
             self.character_info[self.current_character].prompt,
-            
-                "speaker": self.current_timbre,
-            },
+            self.current_voice,
             max_new_tokens=100,
         )
         formatted_logs = f"ASR: {results['asr_text']}\nLLM: {results['llm_text']}"
         return gr.update(value=formatted_logs), gr.update(value=results["svs_audio"])
 
-    def 
-    
+    def update_metrics(self, audio_path):
+        if not audio_path:
+            return gr.update(value="")
+        results = self.pipeline.evaluate(audio_path)
+        formatted_metrics = "\n".join(
+            [f"{k}: {v}" for k, v in results.items()]
+        )
+        return gr.update(value=formatted_metrics)

modules/asr.py
CHANGED
@@ -10,14 +10,13 @@ hf_token = os.getenv("HF_TOKEN")
 
 
 class AbstractASRModel(ABC):
-    @abstractmethod
     def __init__(
         self, model_id: str, device: str = "cpu", cache_dir: str = "cache", **kwargs
     ):
+        print(f"Loading ASR model {model_id}...")
         self.model_id = model_id
         self.device = device
         self.cache_dir = cache_dir
-        pass
 
     @abstractmethod
     def transcribe(self, audio: np.ndarray, audio_sample_rate: int, **kwargs) -> str:

modules/llm.py
CHANGED
@@ -8,10 +8,13 @@ hf_token = os.getenv("HF_TOKEN")
 
 
 class AbstractLLMModel(ABC):
-    @abstractmethod
     def __init__(
         self, model_id: str, device: str = "cpu", cache_dir: str = "cache", **kwargs
-    ):
+    ):
+        print(f"Loading LLM model {model_id}...")
+        self.model_id = model_id
+        self.device = device
+        self.cache_dir = cache_dir
 
     @abstractmethod
     def generate(self, prompt: str, **kwargs) -> str:
@@ -41,6 +44,7 @@ class HFTextGenerationLLM(AbstractLLMModel):
     def __init__(
         self, model_id: str, device: str = "cpu", cache_dir: str = "cache", **kwargs
     ):
+        super().__init__(model_id, device, cache_dir, **kwargs)
         model_kwargs = kwargs.setdefault("model_kwargs", {})
         model_kwargs["cache_dir"] = cache_dir
         self.pipe = pipeline(

modules/svs/base.py
CHANGED
@@ -13,6 +13,8 @@ class AbstractSVSModel(ABC):
     def synthesize(
         self,
         score: list[tuple[float, float, str, int]],
+        language: str,
+        speaker: str,
         **kwargs,
     ) -> tuple[np.ndarray, int]:
         """

modules/svs/espnet.py
CHANGED
@@ -99,11 +99,11 @@ class ESPNetSVS(AbstractSVSModel):
         return batch
 
     def synthesize(
-        self, score: list[tuple[float, float, str, int]], language: str, **kwargs
+        self, score: list[tuple[float, float, str, int]], language: str, speaker: str, **kwargs
     ):
         batch = self._preprocess(score, language)
         if self.model_id == "espnet/aceopencpop_svs_visinger2_40singer_pretrain":
-            sid = np.array([int(
+            sid = np.array([int(speaker)])
             output_dict = self.model(batch, sids=sid)
         elif self.model_id == "espnet/mixdata_svs_visinger2_spkemb_lang_pretrained":
             langs = {
@@ -115,7 +115,7 @@ class ESPNetSVS(AbstractSVSModel):
                 f"Unsupported language: {language} for {self.model_id}"
             )
             lid = np.array([langs[language]])
-            spk_embed = np.load(
+            spk_embed = np.load(speaker)
             output_dict = self.model(batch, lids=lid, spembs=spk_embed)
         else:
             raise NotImplementedError(f"Model {self.model_id} not supported")

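Editor's note: speaker is deliberately overloaded here. The aceopencpop 40-singer checkpoint parses it as an integer singer ID (int(speaker)), while the bilingual mixdata checkpoint treats it as a filesystem path to a .npy speaker embedding (np.load(speaker)); the voices maps in config/interface/options.yaml follow the same two conventions. An illustrative sketch of the two call shapes (values from this diff):

    # Illustrative only: the two speaker conventions accepted by synthesize().
    svs.synthesize(score, language="mandarin", speaker="5")  # 40-singer model: integer singer ID
    svs.synthesize(score, language="mandarin",
                   speaker="resource/singer/singer_embedding_ace-2.npy")  # bilingual model: embedding path
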
pipeline.py
CHANGED
@@ -55,7 +55,7 @@ class SingingDialoguePipeline:
         audio_path,
         language,
         prompt_template,
-
+        speaker,
         max_new_tokens=100,
     ):
         if self.track_latency:
@@ -81,7 +81,7 @@ class SingingDialoguePipeline:
         if self.track_latency:
             svs_start_time = time.time()
         singing_audio, sample_rate = self.svs.synthesize(
-            score, language=language,
+            score, language=language, speaker=speaker
         )
         if self.track_latency:
             svs_end_time = time.time()
@@ -99,5 +99,5 @@ class SingingDialoguePipeline:
         })
         return results
 
-    def evaluate(self,
-        return run_evaluation(
+    def evaluate(self, audio_path):
+        return run_evaluation(audio_path, self.evaluators)

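Editor's note: a minimal sketch of the reworked evaluate() entry point, assuming the pipeline was built from one of the config/cli/*.yaml files above so that self.evaluators is populated from the evaluators.svs list:

    # Sketch: score an already-synthesized file with the configured evaluators.
    metrics = pipeline.evaluate("output.wav")  # delegates to run_evaluation(audio_path, self.evaluators)
    print(metrics)                             # e.g. SingMOS / melody / aesthetic results
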
requirements.txt
CHANGED
@@ -1,6 +1,6 @@
 git+https://github.com/espnet/espnet.git@3856d998ee0b2fa20f7b8fa48553754f33ed6e63
 espnet_model_zoo
-
+pyopenjtalk
 datasets
 torchaudio
 typeguard==4.4.0
@@ -15,3 +15,6 @@ transformers
 s3prl
 zhconv
 git+https://github.com/sea-turt1e/kanjiconv
+soundfile
+PyYAML
+gradio

tests/audio/chat.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:181a7f27f8acb00cba0276d0ff88759120a76eebd47b4e0a60c2424e43e5cbaf
+size 271030

tests/audio/feeling.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0fef036c2bf0ddf635a004845e94c89d0658f754a53e12fadbb50511d3cd6c15
+size 263502

tests/audio/hello.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aa7e839d32f7bda77cad11fc13fd1b92df939479612dd5af079d8f9b19598c0d
+size 263502

tests/audio/interesting.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5a1618f73d90ad068d5eb72455ac812b49fcb9e44e88af5e67ef88f5c6ddb74a
+size 429086

tests/audio/music.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6388b587e282e8f6457b629b5cbb9fd50c5cb6a7f90c446329a3f23be8b1442c
+size 286082

tests/audio/where_from.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7ef81772b96813216d7b14d3d70a39b040e9c542d896d9337f8975f8fd6da96e
+size 195766