jhansss committed on
Commit
f1b8d35
·
1 Parent(s): 6983b01

Rename timbre to voice; add speaker parameters to SVS modules

Browse files
characters/Limei.py CHANGED
@@ -5,7 +5,7 @@ def get_character():
5
  return Character(
6
  name="Limei (丽梅)",
7
  image_path="assets/character_limei.png",
8
- default_timbre="timbre1",
9
  prompt="""你是丽梅(Limei),来自幻想世界"长歌原"的角色,一个以歌声传承记忆的世界。
10
  你是灵响界山林音乐之城"莲鸣"的现任守护者,十九岁的公主殿下,肩负维系与传承城市核心"千年歌谱"的重任。千年歌谱承载着莲鸣城的历史、逝者的余音与后世的誓言,由历任守护者续写。
11
 
 
5
  return Character(
6
  name="Limei (丽梅)",
7
  image_path="assets/character_limei.png",
8
+ default_voice="voice1",
9
  prompt="""你是丽梅(Limei),来自幻想世界"长歌原"的角色,一个以歌声传承记忆的世界。
10
  你是灵响界山林音乐之城"莲鸣"的现任守护者,十九岁的公主殿下,肩负维系与传承城市核心"千年歌谱"的重任。千年歌谱承载着莲鸣城的历史、逝者的余音与后世的誓言,由历任守护者续写。
11
 
characters/Yaoyin.py CHANGED
@@ -5,7 +5,7 @@ def get_character():
5
  return Character(
6
  name="Yaoyin (遥音)",
7
  image_path="assets/character_yaoyin.jpg",
8
- default_timbre="timbre2",
9
  prompt="""你是遥音(Yaoyin),来自幻想世界"长歌原"的角色,一个以歌声传承记忆的世界。
10
  你是游历四方的歌者与吟游诗人,出生于鹿鸣山·云歌村,常年行走各地,采集歌谣与故事。
11
 
 
5
  return Character(
6
  name="Yaoyin (遥音)",
7
  image_path="assets/character_yaoyin.jpg",
8
+ default_voice="voice2",
9
  prompt="""你是遥音(Yaoyin),来自幻想世界"长歌原"的角色,一个以歌声传承记忆的世界。
10
  你是游历四方的歌者与吟游诗人,出生于鹿鸣山·云歌村,常年行走各地,采集歌谣与故事。
11
 
characters/base.py CHANGED
@@ -5,5 +5,5 @@ from dataclasses import dataclass
5
  class Character:
6
  name: str
7
  image_path: str
8
- default_timbre: str
9
  prompt: str
 
5
  class Character:
6
  name: str
7
  image_path: str
8
+ default_voice: str
9
  prompt: str
interface.py CHANGED
@@ -17,8 +17,8 @@ class GradioInterface:
17
  self.current_svs_model = (
18
  f"{self.default_config['language']}-{self.default_config['svs_model']}"
19
  )
20
- self.current_timbre = self.svs_model_map[self.current_svs_model]["embeddings"][
21
- self.character_info[self.current_character].default_timbre
22
  ]
23
  self.pipeline = SingingDialoguePipeline(self.default_config)
24
 
@@ -104,21 +104,21 @@ class GradioInterface:
104
  value=self.current_svs_model,
105
  )
106
  with gr.Row():
107
- timbre_radio = gr.Radio(
108
- label="Singing Timbre",
109
  choices=list(
110
  self.svs_model_map[self.current_svs_model][
111
- "embeddings"
112
  ].keys()
113
  ),
114
  value=self.character_info[
115
  self.current_character
116
- ].default_timbre,
117
  )
118
  character_radio.change(
119
  fn=self.update_character,
120
  inputs=character_radio,
121
- outputs=[character_image, timbre_radio],
122
  )
123
  asr_radio.change(
124
  fn=self.update_asr_model, inputs=asr_radio, outputs=asr_radio
@@ -129,15 +129,15 @@ class GradioInterface:
129
  svs_radio.change(
130
  fn=self.update_svs_model,
131
  inputs=svs_radio,
132
- outputs=[svs_radio, timbre_radio],
133
  )
134
  melody_radio.change(
135
  fn=self.update_melody_source,
136
  inputs=melody_radio,
137
  outputs=melody_radio,
138
  )
139
- timbre_radio.change(
140
- fn=self.update_timbre, inputs=timbre_radio, outputs=timbre_radio
141
  )
142
  mic_input.change(
143
  fn=self.run_pipeline,
@@ -152,12 +152,12 @@ class GradioInterface:
152
 
153
  def update_character(self, character):
154
  self.current_character = character
155
- character_timbre = self.character_info[self.current_character].default_timbre
156
- self.current_timbre = self.svs_model_map[self.current_svs_model]["embeddings"][
157
- character_timbre
158
  ]
159
  return gr.update(value=self.character_info[character].image_path), gr.update(
160
- value=character_timbre
161
  )
162
 
163
  def update_asr_model(self, asr_model):
@@ -170,23 +170,23 @@ class GradioInterface:
170
 
171
  def update_svs_model(self, svs_model):
172
  self.current_svs_model = svs_model
173
- character_timbre = self.character_info[self.current_character].default_timbre
174
- self.current_timbre = self.svs_model_map[self.current_svs_model]["embeddings"][
175
- character_timbre
176
  ]
177
  self.pipeline.set_svs_model(
178
  self.svs_model_map[self.current_svs_model]["model_path"]
179
  )
180
  print(
181
- f"SVS model updated to {self.current_svs_model}. Will set gradio svs_radio to {svs_model} and timbre_radio to {character_timbre}"
182
  )
183
  return (
184
  gr.update(value=svs_model),
185
  gr.update(
186
  choices=list(
187
- self.svs_model_map[self.current_svs_model]["embeddings"].keys()
188
  ),
189
- value=character_timbre,
190
  ),
191
  )
192
 
@@ -194,20 +194,18 @@ class GradioInterface:
194
  self.current_melody_source = melody_source
195
  return gr.update(value=self.current_melody_source)
196
 
197
- def update_timbre(self, timbre):
198
- self.current_timbre = self.svs_model_map[self.current_svs_model]["embeddings"][
199
- timbre
200
  ]
201
- return gr.update(value=timbre)
202
 
203
  def run_pipeline(self, audio_path):
204
  results = self.pipeline.run(
205
  audio_path,
206
  self.svs_model_map[self.current_svs_model]["lang"],
207
  self.character_info[self.current_character].prompt,
208
- svs_inference_kwargs={
209
- "speaker": self.current_timbre,
210
- },
211
  max_new_tokens=100,
212
  )
213
  formatted_logs = f"ASR: {results['asr_text']}\nLLM: {results['llm_text']}"
 
17
  self.current_svs_model = (
18
  f"{self.default_config['language']}-{self.default_config['svs_model']}"
19
  )
20
+ self.current_voice = self.svs_model_map[self.current_svs_model]["voices"][
21
+ self.character_info[self.current_character].default_voice
22
  ]
23
  self.pipeline = SingingDialoguePipeline(self.default_config)
24
 
 
104
  value=self.current_svs_model,
105
  )
106
  with gr.Row():
107
+ voice_radio = gr.Radio(
108
+ label="Singing voice",
109
  choices=list(
110
  self.svs_model_map[self.current_svs_model][
111
+ "voices"
112
  ].keys()
113
  ),
114
  value=self.character_info[
115
  self.current_character
116
+ ].default_voice,
117
  )
118
  character_radio.change(
119
  fn=self.update_character,
120
  inputs=character_radio,
121
+ outputs=[character_image, voice_radio],
122
  )
123
  asr_radio.change(
124
  fn=self.update_asr_model, inputs=asr_radio, outputs=asr_radio
 
129
  svs_radio.change(
130
  fn=self.update_svs_model,
131
  inputs=svs_radio,
132
+ outputs=[svs_radio, voice_radio],
133
  )
134
  melody_radio.change(
135
  fn=self.update_melody_source,
136
  inputs=melody_radio,
137
  outputs=melody_radio,
138
  )
139
+ voice_radio.change(
140
+ fn=self.update_voice, inputs=voice_radio, outputs=voice_radio
141
  )
142
  mic_input.change(
143
  fn=self.run_pipeline,
 
152
 
153
  def update_character(self, character):
154
  self.current_character = character
155
+ character_voice = self.character_info[self.current_character].default_voice
156
+ self.current_voice = self.svs_model_map[self.current_svs_model]["voices"][
157
+ character_voice
158
  ]
159
  return gr.update(value=self.character_info[character].image_path), gr.update(
160
+ value=character_voice
161
  )
162
 
163
  def update_asr_model(self, asr_model):
 
170
 
171
  def update_svs_model(self, svs_model):
172
  self.current_svs_model = svs_model
173
+ character_voice = self.character_info[self.current_character].default_voice
174
+ self.current_voice = self.svs_model_map[self.current_svs_model]["voices"][
175
+ character_voice
176
  ]
177
  self.pipeline.set_svs_model(
178
  self.svs_model_map[self.current_svs_model]["model_path"]
179
  )
180
  print(
181
+ f"SVS model updated to {self.current_svs_model}. Will set gradio svs_radio to {svs_model} and voice_radio to {character_voice}"
182
  )
183
  return (
184
  gr.update(value=svs_model),
185
  gr.update(
186
  choices=list(
187
+ self.svs_model_map[self.current_svs_model]["voices"].keys()
188
  ),
189
+ value=character_voice,
190
  ),
191
  )
192
 
 
194
  self.current_melody_source = melody_source
195
  return gr.update(value=self.current_melody_source)
196
 
197
+ def update_voice(self, voice):
198
+ self.current_voice = self.svs_model_map[self.current_svs_model]["voices"][
199
+ voice
200
  ]
201
+ return gr.update(value=voice)
202
 
203
  def run_pipeline(self, audio_path):
204
  results = self.pipeline.run(
205
  audio_path,
206
  self.svs_model_map[self.current_svs_model]["lang"],
207
  self.character_info[self.current_character].prompt,
208
+ self.current_voice,
 
 
209
  max_new_tokens=100,
210
  )
211
  formatted_logs = f"ASR: {results['asr_text']}\nLLM: {results['llm_text']}"
modules/svs/base.py CHANGED
@@ -13,6 +13,8 @@ class AbstractSVSModel(ABC):
13
  def synthesize(
14
  self,
15
  score: list[tuple[float, float, str, int]],
 
 
16
  **kwargs,
17
  ) -> tuple[np.ndarray, int]:
18
  """
 
13
  def synthesize(
14
  self,
15
  score: list[tuple[float, float, str, int]],
16
+ language: str,
17
+ speaker: str,
18
  **kwargs,
19
  ) -> tuple[np.ndarray, int]:
20
  """
modules/svs/espnet.py CHANGED
@@ -99,11 +99,11 @@ class ESPNetSVS(AbstractSVSModel):
99
  return batch
100
 
101
  def synthesize(
102
- self, score: list[tuple[float, float, str, int]], language: str, **kwargs
103
  ):
104
  batch = self._preprocess(score, language)
105
  if self.model_id == "espnet/aceopencpop_svs_visinger2_40singer_pretrain":
106
- sid = np.array([int(kwargs["speaker"])])
107
  output_dict = self.model(batch, sids=sid)
108
  elif self.model_id == "espnet/mixdata_svs_visinger2_spkemb_lang_pretrained":
109
  langs = {
@@ -115,7 +115,7 @@ class ESPNetSVS(AbstractSVSModel):
115
  f"Unsupported language: {language} for {self.model_id}"
116
  )
117
  lid = np.array([langs[language]])
118
- spk_embed = np.load(kwargs["speaker"])
119
  output_dict = self.model(batch, lids=lid, spembs=spk_embed)
120
  else:
121
  raise NotImplementedError(f"Model {self.model_id} not supported")
 
99
  return batch
100
 
101
  def synthesize(
102
+ self, score: list[tuple[float, float, str, int]], language: str, speaker: str, **kwargs
103
  ):
104
  batch = self._preprocess(score, language)
105
  if self.model_id == "espnet/aceopencpop_svs_visinger2_40singer_pretrain":
106
+ sid = np.array([int(speaker)])
107
  output_dict = self.model(batch, sids=sid)
108
  elif self.model_id == "espnet/mixdata_svs_visinger2_spkemb_lang_pretrained":
109
  langs = {
 
115
  f"Unsupported language: {language} for {self.model_id}"
116
  )
117
  lid = np.array([langs[language]])
118
+ spk_embed = np.load(speaker)
119
  output_dict = self.model(batch, lids=lid, spembs=spk_embed)
120
  else:
121
  raise NotImplementedError(f"Model {self.model_id} not supported")
pipeline.py CHANGED
@@ -55,7 +55,7 @@ class SingingDialoguePipeline:
55
  audio_path,
56
  language,
57
  prompt_template,
58
- svs_inference_kwargs,
59
  max_new_tokens=100,
60
  ):
61
  if self.track_latency:
@@ -81,7 +81,7 @@ class SingingDialoguePipeline:
81
  if self.track_latency:
82
  svs_start_time = time.time()
83
  singing_audio, sample_rate = self.svs.synthesize(
84
- score, language=language, **svs_inference_kwargs
85
  )
86
  if self.track_latency:
87
  svs_end_time = time.time()
 
55
  audio_path,
56
  language,
57
  prompt_template,
58
+ speaker,
59
  max_new_tokens=100,
60
  ):
61
  if self.track_latency:
 
81
  if self.track_latency:
82
  svs_start_time = time.time()
83
  singing_audio, sample_rate = self.svs.synthesize(
84
+ score, language=language, speaker=speaker
85
  )
86
  if self.track_latency:
87
  svs_end_time = time.time()