jhansss committed
Commit e642717
2 Parent(s): 9c3a8d3 09fa5bf

Merge branch 'refactor' into hf
.gitattributes CHANGED
@@ -1,2 +1,4 @@
  *.png filter=lfs diff=lfs merge=lfs -text
  *.jpg filter=lfs diff=lfs merge=lfs -text
+ *.wav filter=lfs diff=lfs merge=lfs -text
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -9,7 +9,8 @@ from interface import GradioInterface

  def main():
      demo = GradioInterface(
-         options_config="config/options.yaml", default_config="config/default.yaml"
+         options_config="config/interface/options.yaml",
+         default_config="config/interface/default.yaml",
      ).create_interface()
      demo.launch()

characters/Limei.py CHANGED
@@ -5,7 +5,7 @@ def get_character():
      return Character(
          name="Limei (丽梅)",
          image_path="assets/character_limei.png",
-         default_timbre="timbre1",
+         default_voice="voice1",
          prompt="""你是丽梅(Limei),来自幻想世界"长歌原"的角色,一个以歌声传承记忆的世界。
  你是灵响界山林音乐之城"莲鸣"的现任守护者,十九岁的公主殿下,肩负维系与传承城市核心"千年歌谱"的重任。千年歌谱承载着莲鸣城的历史、逝者的余音与后世的誓言,由历任守护者续写。
characters/Yaoyin.py CHANGED
@@ -5,7 +5,7 @@ def get_character():
      return Character(
          name="Yaoyin (遥音)",
          image_path="assets/character_yaoyin.jpg",
-         default_timbre="timbre2",
+         default_voice="voice2",
          prompt="""你是遥音(Yaoyin),来自幻想世界"长歌原"的角色,一个以歌声传承记忆的世界。
  你是游历四方的歌者与吟游诗人,出生于鹿鸣山·云歌村,常年行走各地,采集歌谣与故事。
characters/base.py CHANGED
@@ -5,5 +5,5 @@ from dataclasses import dataclass
  class Character:
      name: str
      image_path: str
-     default_timbre: str
+     default_voice: str
      prompt: str
cli.py ADDED
@@ -0,0 +1,46 @@
+ from argparse import ArgumentParser
+ from logging import getLogger
+
+ import soundfile as sf
+ import yaml
+
+ from characters import CHARACTERS
+ from pipeline import SingingDialoguePipeline
+
+ logger = getLogger(__name__)
+
+
+ def get_parser():
+     parser = ArgumentParser()
+     parser.add_argument("--query_audio", type=str, required=True)
+     parser.add_argument("--config_path", type=str, default="config/cli/yaoyin_default.yaml")
+     parser.add_argument("--output_audio", type=str, required=True)
+     return parser
+
+
+ def load_config(config_path: str):
+     with open(config_path, "r") as f:
+         config = yaml.safe_load(f)
+     return config
+
+
+ def main():
+     parser = get_parser()
+     args = parser.parse_args()
+     config = load_config(args.config_path)
+     pipeline = SingingDialoguePipeline(config)
+     speaker = config["speaker"]
+     language = config["language"]
+     character_name = config["prompt_template_character"]
+     character = CHARACTERS[character_name]
+     prompt_template = character.prompt
+     results = pipeline.run(args.query_audio, language, prompt_template, speaker)
+     logger.info(
+         f"Input: {args.query_audio}, Output: {args.output_audio}, ASR results: {results['asr_text']}, LLM results: {results['llm_text']}"
+     )
+     svs_audio, svs_sample_rate = results["svs_audio"]
+     sf.write(args.output_audio, svs_audio, svs_sample_rate)
+
+
+ if __name__ == "__main__":
+     main()
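A minimal smoke test for the new CLI, using the test clip and Yaoyin preset added in this commit; the output filename is an assumption, not part of the commit:

# Hypothetical smoke test for cli.py; only the output path is assumed.
import subprocess

subprocess.run(
    [
        "python", "cli.py",
        "--query_audio", "tests/audio/hello.wav",           # LFS test clip added in this commit
        "--config_path", "config/cli/yaoyin_default.yaml",  # Yaoyin preset added in this commit
        "--output_audio", "yaoyin_reply.wav",               # assumed output location
    ],
    check=True,
)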
config/cli/limei_default.yaml ADDED
@@ -0,0 +1,16 @@
+ asr_model: openai/whisper-large-v3-turbo
+ llm_model: google/gemma-2-2b
+ svs_model: espnet/aceopencpop_svs_visinger2_40singer_pretrain
+ melody_source: sample-lyric-kising
+ language: mandarin
+ prompt_template_character: Limei
+ speaker: 5
+ cache_dir: .cache
+
+ track_latency: True
+ evaluators:
+   svs:
+     - singmos
+     - per
+     - melody
+     - aesthetic
config/cli/yaoyin_default.yaml ADDED
@@ -0,0 +1,16 @@
+ asr_model: openai/whisper-large-v3-turbo
+ llm_model: google/gemma-2-2b
+ svs_model: espnet/aceopencpop_svs_visinger2_40singer_pretrain
+ melody_source: sample-lyric-kising
+ language: mandarin
+ prompt_template_character: Yaoyin
+ speaker: 9
+ cache_dir: .cache
+
+ track_latency: True
+ evaluators:
+   svs:
+     - singmos
+     - per
+     - melody
+     - aesthetic
config/{default.yaml → interface/default.yaml} RENAMED
File without changes
config/interface/options.yaml ADDED
@@ -0,0 +1,63 @@
+ asr_models:
+   - id: openai/whisper-large-v3-turbo
+     name: Whisper large-v3-turbo
+   - id: openai/whisper-large-v3
+     name: Whisper large-v3
+   - id: openai/whisper-medium
+     name: Whisper medium
+   - id: sanchit-gandhi/whisper-small-dv
+     name: Whisper small-dv
+   - id: facebook/wav2vec2-base-960h
+     name: Wav2Vec2-Base-960h
+
+ llm_models:
+   - id: google/gemma-2-2b
+     name: Gemma 2 2B
+   - id: MiniMaxAI/MiniMax-M1-80k
+     name: MiniMax M1 80k
+
+ svs_models:
+   - id: mandarin-espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
+     name: Visinger2 (Bilingual)-zh
+     model_path: espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
+     lang: mandarin
+     voices:
+       voice1: resource/singer/singer_embedding_ace-2.npy
+       voice2: resource/singer/singer_embedding_ace-8.npy
+       voice3: resource/singer/singer_embedding_itako.npy
+       voice4: resource/singer/singer_embedding_kising_orange.npy
+       voice5: resource/singer/singer_embedding_m4singer_Alto-4.npy
+   - id: japanese-espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
+     name: Visinger2 (Bilingual)-jp
+     model_path: espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
+     lang: japanese
+     voices:
+       voice1: resource/singer/singer_embedding_ace-2.npy
+       voice2: resource/singer/singer_embedding_ace-8.npy
+       voice3: resource/singer/singer_embedding_itako.npy
+       voice4: resource/singer/singer_embedding_kising_orange.npy
+       voice5: resource/singer/singer_embedding_m4singer_Alto-4.npy
+   - id: mandarin-espnet/aceopencpop_svs_visinger2_40singer_pretrain
+     name: Visinger2 (Chinese)
+     model_path: espnet/aceopencpop_svs_visinger2_40singer_pretrain
+     lang: mandarin
+     voices:
+       voice1: 5
+       voice2: 8
+       voice3: 12
+       voice4: 15
+       voice5: 29
+
+ melody_sources:
+   - id: gen-random-none
+     name: Random Generation
+     desc: "Melody is generated without any structure or reference."
+   - id: sample-note-kising
+     name: Sampled Melody (KiSing)
+     desc: "Melody is retrieved from the KiSing dataset."
+   - id: sample-note-touhou
+     name: Sampled Melody (Touhou)
+     desc: "Melody is retrieved from the Touhou dataset."
+   - id: sample-lyric-kising
+     name: Sampled Melody with Lyrics (KiSing)
+     desc: "Melody with aligned lyrics is sampled from the KiSing dataset."
config/options.yaml CHANGED
@@ -15,6 +15,8 @@ llm_models:
      name: Gemma 2 2B
    - id: MiniMaxAI/MiniMax-M1-80k
      name: MiniMax M1 80k
+   - id: meta-llama/Llama-3.2-3B-Instruct
+     name: Llama 3.2 3B Instruct

  svs_models:
    - id: mandarin-espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
evaluation/svs_eval.py CHANGED
@@ -37,7 +37,8 @@ def init_audiobox_aesthetics():
  # ----------- Evaluation -----------


- def eval_singmos(audio_array, sr, predictor):
+ def eval_singmos(audio_path, predictor):
+     audio_array, sr = librosa.load(audio_path, sr=44100)
      wav = librosa.resample(audio_array, orig_sr=sr, target_sr=16000)
      wav_tensor = torch.from_numpy(wav).unsqueeze(0)
      length_tensor = torch.tensor([wav_tensor.shape[1]])
@@ -71,7 +72,8 @@ def compute_dissonance_rate(intervals, dissonant_intervals={1, 2, 6, 10, 11}):
      return np.mean(dissonant) if intervals else np.nan


- def eval_per(audio_array, sr, model=None):
+ def eval_per(audio_path, model=None):
+     audio_array, sr = librosa.load(audio_path, sr=16000)
      # TODO: implement PER evaluation
      return {}
@@ -97,20 +99,16 @@ def load_evaluators(config):
      return loaded


- def run_evaluation(audio_array, sr, evaluators):
+ def run_evaluation(audio_path, evaluators):
      results = {}
      if "singmos" in evaluators:
-         results.update(eval_singmos(audio_array, sr, evaluators["singmos"]))
+         results.update(eval_singmos(audio_path, evaluators["singmos"]))
      if "per" in evaluators:
-         results.update(eval_per(audio_array, sr, evaluators["per"]))
-     # create a tmp file with unique name
-     tmp_path = Path(".tmp") / f"{uuid.uuid4()}.wav"
-     sf.write(tmp_path, audio_array, sr)
+         results.update(eval_per(audio_path, evaluators["per"]))
      if "melody" in evaluators:
-         results.update(eval_melody_metrics(tmp_path, evaluators["melody"]))
+         results.update(eval_melody_metrics(audio_path, evaluators["melody"]))
      if "aesthetic" in evaluators:
-         results.update(eval_aesthetic(tmp_path, evaluators["aesthetic"]))
-     tmp_path.unlink()
+         results.update(eval_aesthetic(audio_path, evaluators["aesthetic"]))
      return results
@@ -122,9 +120,8 @@ if __name__ == "__main__":
      parser.add_argument("--results_csv", type=str, required=True)
      parser.add_argument("--evaluators", type=str, default="singmos,melody,aesthetic")
      args = parser.parse_args()
-     audio_array, sr = librosa.load(args.wav_path, sr=None)
      evaluators = load_evaluators(args.evaluators.split(","))
-     results = run_evaluation(audio_array, sr, evaluators)
+     results = run_evaluation(args.wav_path, evaluators)
      print(results)

      with open(args.results_csv, "a") as f:
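run_evaluation now takes a wav path instead of a pre-loaded (array, sample rate) pair, and the temporary-file round trip is gone: each metric loads the audio itself at the rate it needs (44.1 kHz for SingMOS, 16 kHz for PER). A sketch of the new call shape, assuming the repository root is on PYTHONPATH:

from evaluation.svs_eval import load_evaluators, run_evaluation

evaluators = load_evaluators(["singmos", "melody", "aesthetic"])
metrics = run_evaluation("tests/audio/music.wav", evaluators)  # pass the path, not the waveform
print(metrics)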
interface.py CHANGED
@@ -17,8 +17,8 @@ class GradioInterface:
          self.current_svs_model = (
              f"{self.default_config['language']}-{self.default_config['svs_model']}"
          )
-         self.current_timbre = self.svs_model_map[self.current_svs_model]["embeddings"][
-             self.character_info[self.current_character].default_timbre
+         self.current_voice = self.svs_model_map[self.current_svs_model]["voices"][
+             self.character_info[self.current_character].default_voice
          ]
          self.pipeline = SingingDialoguePipeline(self.default_config)

@@ -104,21 +104,21 @@ class GradioInterface:
                              value=self.current_svs_model,
                          )
                          with gr.Row():
-                             timbre_radio = gr.Radio(
-                                 label="Singing Timbre",
+                             voice_radio = gr.Radio(
+                                 label="Singing voice",
                                  choices=list(
                                      self.svs_model_map[self.current_svs_model][
-                                         "embeddings"
+                                         "voices"
                                      ].keys()
                                  ),
                                  value=self.character_info[
                                      self.current_character
-                                 ].default_timbre,
+                                 ].default_voice,
                              )
                  character_radio.change(
                      fn=self.update_character,
                      inputs=character_radio,
-                     outputs=[character_image, timbre_radio],
+                     outputs=[character_image, voice_radio],
                  )
                  asr_radio.change(
                      fn=self.update_asr_model, inputs=asr_radio, outputs=asr_radio
@@ -129,35 +129,41 @@ class GradioInterface:
                  svs_radio.change(
                      fn=self.update_svs_model,
                      inputs=svs_radio,
-                     outputs=[svs_radio, timbre_radio],
+                     outputs=[svs_radio, voice_radio],
                  )
                  melody_radio.change(
                      fn=self.update_melody_source,
                      inputs=melody_radio,
                      outputs=melody_radio,
                  )
-                 timbre_radio.change(
-                     fn=self.update_timbre, inputs=timbre_radio, outputs=timbre_radio
+                 voice_radio.change(
+                     fn=self.update_voice, inputs=voice_radio, outputs=voice_radio
                  )
                  mic_input.change(
                      fn=self.run_pipeline,
                      inputs=mic_input,
                      outputs=[interaction_log, audio_output],
                  )
+                 metrics_button.click(
+                     fn=self.update_metrics,
+                     inputs=audio_output,
+                     outputs=[metrics_output],
+                 )

                  return demo
          except Exception as e:
              print(f"error: {e}")
              breakpoint()
+             return gr.Blocks()

      def update_character(self, character):
          self.current_character = character
-         character_timbre = self.character_info[self.current_character].default_timbre
-         self.current_timbre = self.svs_model_map[self.current_svs_model]["embeddings"][
-             character_timbre
+         character_voice = self.character_info[self.current_character].default_voice
+         self.current_voice = self.svs_model_map[self.current_svs_model]["voices"][
+             character_voice
          ]
          return gr.update(value=self.character_info[character].image_path), gr.update(
-             value=character_timbre
+             value=character_voice
          )

      def update_asr_model(self, asr_model):
@@ -170,23 +176,23 @@ class GradioInterface:

      def update_svs_model(self, svs_model):
          self.current_svs_model = svs_model
-         character_timbre = self.character_info[self.current_character].default_timbre
-         self.current_timbre = self.svs_model_map[self.current_svs_model]["embeddings"][
-             character_timbre
+         character_voice = self.character_info[self.current_character].default_voice
+         self.current_voice = self.svs_model_map[self.current_svs_model]["voices"][
+             character_voice
          ]
          self.pipeline.set_svs_model(
              self.svs_model_map[self.current_svs_model]["model_path"]
          )
          print(
-             f"SVS model updated to {self.current_svs_model}. Will set gradio svs_radio to {svs_model} and timbre_radio to {character_timbre}"
+             f"SVS model updated to {self.current_svs_model}. Will set gradio svs_radio to {svs_model} and voice_radio to {character_voice}"
          )
          return (
              gr.update(value=svs_model),
              gr.update(
                  choices=list(
-                     self.svs_model_map[self.current_svs_model]["embeddings"].keys()
+                     self.svs_model_map[self.current_svs_model]["voices"].keys()
                  ),
-                 value=character_timbre,
+                 value=character_voice,
              ),
          )

@@ -194,24 +200,30 @@ class GradioInterface:
          self.current_melody_source = melody_source
          return gr.update(value=self.current_melody_source)

-     def update_timbre(self, timbre):
-         self.current_timbre = self.svs_model_map[self.current_svs_model]["embeddings"][
-             timbre
+     def update_voice(self, voice):
+         self.current_voice = self.svs_model_map[self.current_svs_model]["voices"][
+             voice
          ]
-         return gr.update(value=timbre)
+         return gr.update(value=voice)

      def run_pipeline(self, audio_path):
+         if not audio_path:
+             return gr.update(value=""), gr.update(value="")
          results = self.pipeline.run(
              audio_path,
              self.svs_model_map[self.current_svs_model]["lang"],
              self.character_info[self.current_character].prompt,
-             svs_inference_kwargs={
-                 "speaker": self.current_timbre,
-             },
+             self.current_voice,
              max_new_tokens=100,
          )
          formatted_logs = f"ASR: {results['asr_text']}\nLLM: {results['llm_text']}"
          return gr.update(value=formatted_logs), gr.update(value=results["svs_audio"])

-     def run_evaluation(self, audio, audio_sample_rate):
-         pass
+     def update_metrics(self, audio_path):
+         if not audio_path:
+             return gr.update(value="")
+         results = self.pipeline.evaluate(audio_path)
+         formatted_metrics = "\n".join(
+             [f"{k}: {v}" for k, v in results.items()]
+         )
+         return gr.update(value=formatted_metrics)
modules/asr.py CHANGED
@@ -10,14 +10,13 @@ hf_token = os.getenv("HF_TOKEN")


  class AbstractASRModel(ABC):
-     @abstractmethod
      def __init__(
          self, model_id: str, device: str = "cpu", cache_dir: str = "cache", **kwargs
      ):
+         print(f"Loading ASR model {model_id}...")
          self.model_id = model_id
          self.device = device
          self.cache_dir = cache_dir
-         pass

      @abstractmethod
      def transcribe(self, audio: np.ndarray, audio_sample_rate: int, **kwargs) -> str:
modules/llm.py CHANGED
@@ -8,10 +8,13 @@ hf_token = os.getenv("HF_TOKEN")


  class AbstractLLMModel(ABC):
-     @abstractmethod
      def __init__(
          self, model_id: str, device: str = "cpu", cache_dir: str = "cache", **kwargs
-     ): ...
+     ):
+         print(f"Loading LLM model {model_id}...")
+         self.model_id = model_id
+         self.device = device
+         self.cache_dir = cache_dir

      @abstractmethod
      def generate(self, prompt: str, **kwargs) -> str:
@@ -41,6 +44,7 @@ class HFTextGenerationLLM(AbstractLLMModel):
      def __init__(
          self, model_id: str, device: str = "cpu", cache_dir: str = "cache", **kwargs
      ):
+         super().__init__(model_id, device, cache_dir, **kwargs)
          model_kwargs = kwargs.setdefault("model_kwargs", {})
          model_kwargs["cache_dir"] = cache_dir
          self.pipe = pipeline(
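Since the base __init__ is now concrete, subclasses are expected to chain up so model_id, device, and cache_dir are always populated, as HFTextGenerationLLM does above. A sketch of that contract with a hypothetical subclass (EchoLLM is illustrative only, not part of this commit):

from modules.llm import AbstractLLMModel

class EchoLLM(AbstractLLMModel):
    def __init__(self, model_id: str, device: str = "cpu", cache_dir: str = "cache", **kwargs):
        # Chain up: logs the load and stores model_id / device / cache_dir.
        super().__init__(model_id, device, cache_dir, **kwargs)

    def generate(self, prompt: str, **kwargs) -> str:
        # Placeholder generation: echo the prompt back.
        return prompt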
modules/svs/base.py CHANGED
@@ -13,6 +13,8 @@ class AbstractSVSModel(ABC):
      def synthesize(
          self,
          score: list[tuple[float, float, str, int]],
+         language: str,
+         speaker: str,
          **kwargs,
      ) -> tuple[np.ndarray, int]:
          """
modules/svs/espnet.py CHANGED
@@ -99,11 +99,11 @@ class ESPNetSVS(AbstractSVSModel):
          return batch

      def synthesize(
-         self, score: list[tuple[float, float, str, int]], language: str, **kwargs
+         self, score: list[tuple[float, float, str, int]], language: str, speaker: str, **kwargs
      ):
          batch = self._preprocess(score, language)
          if self.model_id == "espnet/aceopencpop_svs_visinger2_40singer_pretrain":
-             sid = np.array([int(kwargs["speaker"])])
+             sid = np.array([int(speaker)])
              output_dict = self.model(batch, sids=sid)
          elif self.model_id == "espnet/mixdata_svs_visinger2_spkemb_lang_pretrained":
              langs = {
@@ -115,7 +115,7 @@ class ESPNetSVS(AbstractSVSModel):
                  f"Unsupported language: {language} for {self.model_id}"
              )
              lid = np.array([langs[language]])
-             spk_embed = np.load(kwargs["speaker"])
+             spk_embed = np.load(speaker)
              output_dict = self.model(batch, lids=lid, spembs=spk_embed)
          else:
              raise NotImplementedError(f"Model {self.model_id} not supported")
pipeline.py CHANGED
@@ -55,7 +55,7 @@ class SingingDialoguePipeline:
          audio_path,
          language,
          prompt_template,
-         svs_inference_kwargs,
+         speaker,
          max_new_tokens=100,
      ):
          if self.track_latency:
@@ -81,7 +81,7 @@ class SingingDialoguePipeline:
          if self.track_latency:
              svs_start_time = time.time()
          singing_audio, sample_rate = self.svs.synthesize(
-             score, language=language, **svs_inference_kwargs
+             score, language=language, speaker=speaker
          )
          if self.track_latency:
              svs_end_time = time.time()
@@ -99,5 +99,5 @@ class SingingDialoguePipeline:
          })
          return results

-     def evaluate(self, audio, sample_rate):
-         return run_evaluation(audio, sample_rate, self.evaluators)
+     def evaluate(self, audio_path):
+         return run_evaluation(audio_path, self.evaluators)
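Putting the refactor together: the pipeline now takes a single speaker value that is passed straight through to the SVS model, and evaluation works from the written wav file. A sketch mirroring cli.py (the output filename is an assumption):

import yaml
import soundfile as sf

from characters import CHARACTERS
from pipeline import SingingDialoguePipeline

with open("config/cli/yaoyin_default.yaml") as f:
    config = yaml.safe_load(f)

pipeline = SingingDialoguePipeline(config)
results = pipeline.run(
    "tests/audio/hello.wav",                                 # query audio
    config["language"],                                      # "mandarin"
    CHARACTERS[config["prompt_template_character"]].prompt,  # Yaoyin's prompt
    config["speaker"],                                       # replaces svs_inference_kwargs
)
svs_audio, svs_sample_rate = results["svs_audio"]
sf.write("yaoyin_reply.wav", svs_audio, svs_sample_rate)
print(pipeline.evaluate("yaoyin_reply.wav"))                 # evaluate() now takes a path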
requirements.txt CHANGED
@@ -1,6 +1,6 @@
  git+https://github.com/espnet/espnet.git@3856d998ee0b2fa20f7b8fa48553754f33ed6e63
  espnet_model_zoo
- # pyopenjtalk
+ pyopenjtalk
  datasets
  torchaudio
  typeguard==4.4.0
@@ -15,3 +15,6 @@ transformers
  s3prl
  zhconv
  git+https://github.com/sea-turt1e/kanjiconv
+ soundfile
+ PyYAML
+ gradio
tests/audio/chat.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:181a7f27f8acb00cba0276d0ff88759120a76eebd47b4e0a60c2424e43e5cbaf
+ size 271030
tests/audio/feeling.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0fef036c2bf0ddf635a004845e94c89d0658f754a53e12fadbb50511d3cd6c15
+ size 263502
tests/audio/hello.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aa7e839d32f7bda77cad11fc13fd1b92df939479612dd5af079d8f9b19598c0d
+ size 263502
tests/audio/interesting.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5a1618f73d90ad068d5eb72455ac812b49fcb9e44e88af5e67ef88f5c6ddb74a
+ size 429086
tests/audio/music.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6388b587e282e8f6457b629b5cbb9fd50c5cb6a7f90c446329a3f23be8b1442c
+ size 286082
tests/audio/where_from.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7ef81772b96813216d7b14d3d70a39b040e9c542d896d9337f8975f8fd6da96e
+ size 195766