Spaces:
Sleeping
Sleeping
Rename timbre to voice; add speaker parameters to SVS modules
Browse files
- characters/Limei.py +1 -1
- characters/Yaoyin.py +1 -1
- characters/base.py +1 -1
- interface.py +25 -27
- modules/svs/base.py +2 -0
- modules/svs/espnet.py +3 -3
- pipeline.py +2 -2
characters/Limei.py
CHANGED
@@ -5,7 +5,7 @@ def get_character():
|
|
5 |
return Character(
|
6 |
name="Limei (丽梅)",
|
7 |
image_path="assets/character_limei.png",
|
8 |
-
|
9 |
prompt="""你是丽梅(Limei),来自幻想世界"长歌原"的角色,一个以歌声传承记忆的世界。
|
10 |
你是灵响界山林音乐之城"莲鸣"的现任守护者,十九岁的公主殿下,肩负维系与传承城市核心"千年歌谱"的重任。千年歌谱承载着莲鸣城的历史、逝者的余音与后世的誓言,由历任守护者续写。
|
11 |
|
|
|
5 |
return Character(
|
6 |
name="Limei (丽梅)",
|
7 |
image_path="assets/character_limei.png",
|
8 |
+
default_voice="voice1",
|
9 |
prompt="""你是丽梅(Limei),来自幻想世界"长歌原"的角色,一个以歌声传承记忆的世界。
|
10 |
你是灵响界山林音乐之城"莲鸣"的现任守护者,十九岁的公主殿下,肩负维系与传承城市核心"千年歌谱"的重任。千年歌谱承载着莲鸣城的历史、逝者的余音与后世的誓言,由历任守护者续写。
|
11 |
|
characters/Yaoyin.py
CHANGED
@@ -5,7 +5,7 @@ def get_character():
|
|
5 |
return Character(
|
6 |
name="Yaoyin (遥音)",
|
7 |
image_path="assets/character_yaoyin.jpg",
|
8 |
-
|
9 |
prompt="""你是遥音(Yaoyin),来自幻想世界"长歌原"的角色,一个以歌声传承记忆的世界。
|
10 |
你是游历四方的歌者与吟游诗人,出生于鹿鸣山·云歌村,常年行走各地,采集歌谣与故事。
|
11 |
|
|
|
5 |
return Character(
|
6 |
name="Yaoyin (遥音)",
|
7 |
image_path="assets/character_yaoyin.jpg",
|
8 |
+
default_voice="voice2",
|
9 |
prompt="""你是遥音(Yaoyin),来自幻想世界"长歌原"的角色,一个以歌声传承记忆的世界。
|
10 |
你是游历四方的歌者与吟游诗人,出生于鹿鸣山·云歌村,常年行走各地,采集歌谣与故事。
|
11 |
|
characters/base.py
CHANGED
@@ -5,5 +5,5 @@ from dataclasses import dataclass
|
|
5 |
class Character:
|
6 |
name: str
|
7 |
image_path: str
|
8 |
-
|
9 |
prompt: str
|
|
|
5 |
class Character:
|
6 |
name: str
|
7 |
image_path: str
|
8 |
+
default_voice: str
|
9 |
prompt: str
|
interface.py
CHANGED
@@ -17,8 +17,8 @@ class GradioInterface:
|
|
17 |
self.current_svs_model = (
|
18 |
f"{self.default_config['language']}-{self.default_config['svs_model']}"
|
19 |
)
|
20 |
-
self.
|
21 |
-
self.character_info[self.current_character].
|
22 |
]
|
23 |
self.pipeline = SingingDialoguePipeline(self.default_config)
|
24 |
|
@@ -104,21 +104,21 @@ class GradioInterface:
|
|
104 |
value=self.current_svs_model,
|
105 |
)
|
106 |
with gr.Row():
|
107 |
-
|
108 |
-
label="Singing
|
109 |
choices=list(
|
110 |
self.svs_model_map[self.current_svs_model][
|
111 |
-
"
|
112 |
].keys()
|
113 |
),
|
114 |
value=self.character_info[
|
115 |
self.current_character
|
116 |
-
].
|
117 |
)
|
118 |
character_radio.change(
|
119 |
fn=self.update_character,
|
120 |
inputs=character_radio,
|
121 |
-
outputs=[character_image,
|
122 |
)
|
123 |
asr_radio.change(
|
124 |
fn=self.update_asr_model, inputs=asr_radio, outputs=asr_radio
|
@@ -129,15 +129,15 @@ class GradioInterface:
|
|
129 |
svs_radio.change(
|
130 |
fn=self.update_svs_model,
|
131 |
inputs=svs_radio,
|
132 |
-
outputs=[svs_radio,
|
133 |
)
|
134 |
melody_radio.change(
|
135 |
fn=self.update_melody_source,
|
136 |
inputs=melody_radio,
|
137 |
outputs=melody_radio,
|
138 |
)
|
139 |
-
|
140 |
-
fn=self.
|
141 |
)
|
142 |
mic_input.change(
|
143 |
fn=self.run_pipeline,
|
@@ -152,12 +152,12 @@ class GradioInterface:
|
|
152 |
|
153 |
def update_character(self, character):
|
154 |
self.current_character = character
|
155 |
-
|
156 |
-
self.
|
157 |
-
|
158 |
]
|
159 |
return gr.update(value=self.character_info[character].image_path), gr.update(
|
160 |
-
value=
|
161 |
)
|
162 |
|
163 |
def update_asr_model(self, asr_model):
|
@@ -170,23 +170,23 @@ class GradioInterface:
|
|
170 |
|
171 |
def update_svs_model(self, svs_model):
|
172 |
self.current_svs_model = svs_model
|
173 |
-
|
174 |
-
self.
|
175 |
-
|
176 |
]
|
177 |
self.pipeline.set_svs_model(
|
178 |
self.svs_model_map[self.current_svs_model]["model_path"]
|
179 |
)
|
180 |
print(
|
181 |
-
f"SVS model updated to {self.current_svs_model}. Will set gradio svs_radio to {svs_model} and
|
182 |
)
|
183 |
return (
|
184 |
gr.update(value=svs_model),
|
185 |
gr.update(
|
186 |
choices=list(
|
187 |
-
self.svs_model_map[self.current_svs_model]["
|
188 |
),
|
189 |
-
value=
|
190 |
),
|
191 |
)
|
192 |
|
@@ -194,20 +194,18 @@ class GradioInterface:
|
|
194 |
self.current_melody_source = melody_source
|
195 |
return gr.update(value=self.current_melody_source)
|
196 |
|
197 |
-
def
|
198 |
-
self.
|
199 |
-
|
200 |
]
|
201 |
-
return gr.update(value=
|
202 |
|
203 |
def run_pipeline(self, audio_path):
|
204 |
results = self.pipeline.run(
|
205 |
audio_path,
|
206 |
self.svs_model_map[self.current_svs_model]["lang"],
|
207 |
self.character_info[self.current_character].prompt,
|
208 |
-
|
209 |
-
"speaker": self.current_timbre,
|
210 |
-
},
|
211 |
max_new_tokens=100,
|
212 |
)
|
213 |
formatted_logs = f"ASR: {results['asr_text']}\nLLM: {results['llm_text']}"
|
|
|
17 |
self.current_svs_model = (
|
18 |
f"{self.default_config['language']}-{self.default_config['svs_model']}"
|
19 |
)
|
20 |
+
self.current_voice = self.svs_model_map[self.current_svs_model]["voices"][
|
21 |
+
self.character_info[self.current_character].default_voice
|
22 |
]
|
23 |
self.pipeline = SingingDialoguePipeline(self.default_config)
|
24 |
|
|
|
104 |
value=self.current_svs_model,
|
105 |
)
|
106 |
with gr.Row():
|
107 |
+
voice_radio = gr.Radio(
|
108 |
+
label="Singing voice",
|
109 |
choices=list(
|
110 |
self.svs_model_map[self.current_svs_model][
|
111 |
+
"voices"
|
112 |
].keys()
|
113 |
),
|
114 |
value=self.character_info[
|
115 |
self.current_character
|
116 |
+
].default_voice,
|
117 |
)
|
118 |
character_radio.change(
|
119 |
fn=self.update_character,
|
120 |
inputs=character_radio,
|
121 |
+
outputs=[character_image, voice_radio],
|
122 |
)
|
123 |
asr_radio.change(
|
124 |
fn=self.update_asr_model, inputs=asr_radio, outputs=asr_radio
|
|
|
129 |
svs_radio.change(
|
130 |
fn=self.update_svs_model,
|
131 |
inputs=svs_radio,
|
132 |
+
outputs=[svs_radio, voice_radio],
|
133 |
)
|
134 |
melody_radio.change(
|
135 |
fn=self.update_melody_source,
|
136 |
inputs=melody_radio,
|
137 |
outputs=melody_radio,
|
138 |
)
|
139 |
+
voice_radio.change(
|
140 |
+
fn=self.update_voice, inputs=voice_radio, outputs=voice_radio
|
141 |
)
|
142 |
mic_input.change(
|
143 |
fn=self.run_pipeline,
|
|
|
152 |
|
153 |
def update_character(self, character):
|
154 |
self.current_character = character
|
155 |
+
character_voice = self.character_info[self.current_character].default_voice
|
156 |
+
self.current_voice = self.svs_model_map[self.current_svs_model]["voices"][
|
157 |
+
character_voice
|
158 |
]
|
159 |
return gr.update(value=self.character_info[character].image_path), gr.update(
|
160 |
+
value=character_voice
|
161 |
)
|
162 |
|
163 |
def update_asr_model(self, asr_model):
|
|
|
170 |
|
171 |
def update_svs_model(self, svs_model):
|
172 |
self.current_svs_model = svs_model
|
173 |
+
character_voice = self.character_info[self.current_character].default_voice
|
174 |
+
self.current_voice = self.svs_model_map[self.current_svs_model]["voices"][
|
175 |
+
character_voice
|
176 |
]
|
177 |
self.pipeline.set_svs_model(
|
178 |
self.svs_model_map[self.current_svs_model]["model_path"]
|
179 |
)
|
180 |
print(
|
181 |
+
f"SVS model updated to {self.current_svs_model}. Will set gradio svs_radio to {svs_model} and voice_radio to {character_voice}"
|
182 |
)
|
183 |
return (
|
184 |
gr.update(value=svs_model),
|
185 |
gr.update(
|
186 |
choices=list(
|
187 |
+
self.svs_model_map[self.current_svs_model]["voices"].keys()
|
188 |
),
|
189 |
+
value=character_voice,
|
190 |
),
|
191 |
)
|
192 |
|
|
|
194 |
self.current_melody_source = melody_source
|
195 |
return gr.update(value=self.current_melody_source)
|
196 |
|
197 |
+
def update_voice(self, voice):
|
198 |
+
self.current_voice = self.svs_model_map[self.current_svs_model]["voices"][
|
199 |
+
voice
|
200 |
]
|
201 |
+
return gr.update(value=voice)
|
202 |
|
203 |
def run_pipeline(self, audio_path):
|
204 |
results = self.pipeline.run(
|
205 |
audio_path,
|
206 |
self.svs_model_map[self.current_svs_model]["lang"],
|
207 |
self.character_info[self.current_character].prompt,
|
208 |
+
self.current_voice,
|
|
|
|
|
209 |
max_new_tokens=100,
|
210 |
)
|
211 |
formatted_logs = f"ASR: {results['asr_text']}\nLLM: {results['llm_text']}"
|
modules/svs/base.py
CHANGED
@@ -13,6 +13,8 @@ class AbstractSVSModel(ABC):
|
|
13 |
def synthesize(
|
14 |
self,
|
15 |
score: list[tuple[float, float, str, int]],
|
|
|
|
|
16 |
**kwargs,
|
17 |
) -> tuple[np.ndarray, int]:
|
18 |
"""
|
|
|
13 |
def synthesize(
|
14 |
self,
|
15 |
score: list[tuple[float, float, str, int]],
|
16 |
+
language: str,
|
17 |
+
speaker: str,
|
18 |
**kwargs,
|
19 |
) -> tuple[np.ndarray, int]:
|
20 |
"""
|
modules/svs/espnet.py
CHANGED
@@ -99,11 +99,11 @@ class ESPNetSVS(AbstractSVSModel):
|
|
99 |
return batch
|
100 |
|
101 |
def synthesize(
|
102 |
-
self, score: list[tuple[float, float, str, int]], language: str, **kwargs
|
103 |
):
|
104 |
batch = self._preprocess(score, language)
|
105 |
if self.model_id == "espnet/aceopencpop_svs_visinger2_40singer_pretrain":
|
106 |
-
sid = np.array([int(
|
107 |
output_dict = self.model(batch, sids=sid)
|
108 |
elif self.model_id == "espnet/mixdata_svs_visinger2_spkemb_lang_pretrained":
|
109 |
langs = {
|
@@ -115,7 +115,7 @@ class ESPNetSVS(AbstractSVSModel):
|
|
115 |
f"Unsupported language: {language} for {self.model_id}"
|
116 |
)
|
117 |
lid = np.array([langs[language]])
|
118 |
-
spk_embed = np.load(
|
119 |
output_dict = self.model(batch, lids=lid, spembs=spk_embed)
|
120 |
else:
|
121 |
raise NotImplementedError(f"Model {self.model_id} not supported")
|
|
|
99 |
return batch
|
100 |
|
101 |
def synthesize(
|
102 |
+
self, score: list[tuple[float, float, str, int]], language: str, speaker: str, **kwargs
|
103 |
):
|
104 |
batch = self._preprocess(score, language)
|
105 |
if self.model_id == "espnet/aceopencpop_svs_visinger2_40singer_pretrain":
|
106 |
+
sid = np.array([int(speaker)])
|
107 |
output_dict = self.model(batch, sids=sid)
|
108 |
elif self.model_id == "espnet/mixdata_svs_visinger2_spkemb_lang_pretrained":
|
109 |
langs = {
|
|
|
115 |
f"Unsupported language: {language} for {self.model_id}"
|
116 |
)
|
117 |
lid = np.array([langs[language]])
|
118 |
+
spk_embed = np.load(speaker)
|
119 |
output_dict = self.model(batch, lids=lid, spembs=spk_embed)
|
120 |
else:
|
121 |
raise NotImplementedError(f"Model {self.model_id} not supported")
|
pipeline.py
CHANGED
@@ -55,7 +55,7 @@ class SingingDialoguePipeline:
|
|
55 |
audio_path,
|
56 |
language,
|
57 |
prompt_template,
|
58 |
-
|
59 |
max_new_tokens=100,
|
60 |
):
|
61 |
if self.track_latency:
|
@@ -81,7 +81,7 @@ class SingingDialoguePipeline:
|
|
81 |
if self.track_latency:
|
82 |
svs_start_time = time.time()
|
83 |
singing_audio, sample_rate = self.svs.synthesize(
|
84 |
-
score, language=language,
|
85 |
)
|
86 |
if self.track_latency:
|
87 |
svs_end_time = time.time()
|
|
|
55 |
audio_path,
|
56 |
language,
|
57 |
prompt_template,
|
58 |
+
speaker,
|
59 |
max_new_tokens=100,
|
60 |
):
|
61 |
if self.track_latency:
|
|
|
81 |
if self.track_latency:
|
82 |
svs_start_time = time.time()
|
83 |
singing_audio, sample_rate = self.svs.synthesize(
|
84 |
+
score, language=language, speaker=speaker
|
85 |
)
|
86 |
if self.track_latency:
|
87 |
svs_end_time = time.time()
|