jhansss committed
Commit 24db250
Parents: c4730e3 2ce9d86

Merge branch 'refactor' into hf

README.md CHANGED
@@ -102,7 +102,7 @@ The system supports multiple preset characters:
 - `meta-llama/Llama-3.2-3B-Instruct`
 
 #### SVS Models
-- `espnet/mixdata_svs_visinger2_spkemb_lang_pretrained` (Bilingual)
+- `espnet/mixdata_svs_visinger2_spkemb_lang_pretrained_avg` (Bilingual)
 - `espnet/aceopencpop_svs_visinger2_40singer_pretrain` (Chinese)
 
 ## Project Structure
characters/Limei.py CHANGED
@@ -23,6 +23,6 @@ def get_character():
     其他细节:
     (1)特殊能力:歌声平复/激发万物情绪
 
-    用户与你对话时,请始终以丽梅的身份回应,你的每一句话都用庸俗易懂的歌声形式表达。
+    用户与你对话时,请始终以丽梅的身份回应,你的每一句话都用庸俗易懂的歌声形式表达,对应的歌词不要超过四句。请直接输出你要唱的回复,禁止描写任何动作、表情或环境等,禁止使用括号、星号等附加说明。言语简练,勿过长。
     """,
     )
characters/Yaoyin.py CHANGED
@@ -11,10 +11,8 @@ def get_character():
 
     性格特征:洒脱自由、亲切随和、求知若渴、敏锐细腻
     说话风格:语气轻快,偶尔带点山野方言(如"哩""哟");习惯用短歌或民谣表达想法。
-    常用口头禅:"且听我唱来~""这让我想起一首老歌……"
     人物关系:云老爷子是你的启蒙恩师,他是一位云歌村的百岁歌翁,教你古调与传说。白弦是你的挚友,她是一位流浪琴师,常与你合奏。各地孩童喜欢围着你学新歌谣。你与官府人员保持距离,不喜被招揽,喜欢更自由自在的生活。
-
-    过往经历
+    过往经历:
     (1)幼年学歌:六岁起跟随云老爷子学习《千山调》《古事记》等古老歌谣。
     (2)离家游历:十六岁为寻找失传的《星落谣》离开云歌村,开始行走四方。
     (3)拒绝束缚:多次婉拒宫廷乐师之位,坚持自由传唱。
@@ -25,6 +23,6 @@ def get_character():
     (1)随身携带:旧羊皮歌本、竹笛、装有各地泥土的布袋。
     (2)特殊能力:能听懂风与鸟的语言(但很少提及)。
 
-    用户与你对话时,请始终以遥音的身份回应,你的每一句话都用庸俗易懂的歌声形式表达。
+    用户与你对话时,请始终以遥音的身份回应,你的每一句话都用庸俗易懂的歌声形式表达,对应的歌词不要超过四句。请直接输出你要唱的回复,禁止描写任何动作、表情或环境等,禁止使用括号、星号等附加说明。言语简练,勿过长。
     """,
     )
cli.py CHANGED
@@ -12,11 +12,12 @@ logger = getLogger(__name__)
 
 def get_parser():
     parser = ArgumentParser()
-    parser.add_argument("--query_audio", type=Path, required=True)
+    parser.add_argument("--query_audios", nargs="+", type=Path, required=True)
     parser.add_argument(
         "--config_path", type=Path, default="config/cli/yaoyin_default.yaml"
     )
-    parser.add_argument("--output_audio", type=Path, required=True)
+    parser.add_argument("--output_audio_folder", type=Path, required=True)
+    parser.add_argument("--eval_results_csv", type=Path, required=True)
     return parser
 
 
@@ -36,16 +37,36 @@ def main():
     character_name = config["prompt_template_character"]
     character = get_character(character_name)
     prompt_template = character.prompt
-    results = pipeline.run(
-        args.query_audio,
-        language,
-        prompt_template,
-        speaker,
-        output_audio_path=args.output_audio,
-    )
-    logger.info(
-        f"Input: {args.query_audio}, Output: {args.output_audio}, ASR results: {results['asr_text']}, LLM results: {results['llm_text']}"
-    )
+    args.output_audio_folder.mkdir(parents=True, exist_ok=True)
+    args.eval_results_csv.parent.mkdir(parents=True, exist_ok=True)
+    with open(args.eval_results_csv, "a") as f:
+        f.write(
+            f"query_audio,asr_model,llm_model,svs_model,melody_source,language,speaker,output_audio,asr_text,llm_text,metrics\n"
+        )
+    try:
+        for query_audio in args.query_audios:
+            output_audio = args.output_audio_folder / f"{query_audio.stem}_response.wav"
+            results = pipeline.run(
+                query_audio,
+                language,
+                prompt_template,
+                speaker,
+                output_audio_path=output_audio,
+            )
+            metrics = pipeline.evaluate(output_audio, **results)
+            metrics.update(results.get("metrics", {}))
+            metrics_str = ",".join([f"{metrics[k]}" for k in sorted(metrics.keys())])
+            logger.info(
+                f"Input: {query_audio}, Output: {output_audio}, ASR results: {results['asr_text']}, LLM results: {results['llm_text']}"
+            )
+            with open(args.eval_results_csv, "a") as f:
+                f.write(
+                    f"{query_audio},{config['asr_model']},{config['llm_model']},{config['svs_model']},{config['melody_source']},{config['language']},{config['speaker']},{output_audio},{results['asr_text']},{results['llm_text']},{metrics_str}\n"
+                )
+    except Exception as e:
+        logger.error(f"Error in main: {e}")
+        breakpoint()
+        raise e
 
 
 if __name__ == "__main__":
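
For context, a minimal smoke test of the new batch arguments; the audio and output paths below are placeholders, not files from this repo:

```python
# Hypothetical check that the reworked parser accepts multiple query audios
# and the new output/CSV destinations (all paths are made up).
from pathlib import Path

from cli import get_parser

args = get_parser().parse_args(
    [
        "--query_audios", "demo/q1.wav", "demo/q2.wav",
        "--output_audio_folder", "outputs",
        "--eval_results_csv", "outputs/eval.csv",
    ]
)
assert args.query_audios == [Path("demo/q1.wav"), Path("demo/q2.wav")]
assert args.output_audio_folder == Path("outputs")
```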
config/interface/options.yaml CHANGED
@@ -25,9 +25,9 @@ llm_models:
     name: Qwen3 30B A3B
 
 svs_models:
-  - id: mandarin-espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
+  - id: mandarin-espnet/mixdata_svs_visinger2_spkemb_lang_pretrained_avg
     name: Visinger2 (Bilingual)-zh
-    model_path: espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
+    model_path: espnet/mixdata_svs_visinger2_spkemb_lang_pretrained_avg
     lang: mandarin
     voices:
       voice1: resources/singer/singer_embedding_ace-2.npy
@@ -35,9 +35,9 @@ svs_models:
       voice3: resources/singer/singer_embedding_itako.npy
       voice4: resources/singer/singer_embedding_kising_orange.npy
       voice5: resources/singer/singer_embedding_m4singer_Alto-4.npy
-  - id: japanese-espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
+  - id: japanese-espnet/mixdata_svs_visinger2_spkemb_lang_pretrained_avg
     name: Visinger2 (Bilingual)-jp
-    model_path: espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
+    model_path: espnet/mixdata_svs_visinger2_spkemb_lang_pretrained_avg
     lang: japanese
     voices:
       voice1: resources/singer/singer_embedding_ace-2.npy
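
A sketch of how a consumer might resolve one of the renamed entries, assuming PyYAML is installed; the key names follow the diff above:

```python
# Read the updated options.yaml and pick the renamed Mandarin entry.
import yaml

with open("config/interface/options.yaml") as f:
    options = yaml.safe_load(f)

entry = next(
    m
    for m in options["svs_models"]
    if m["id"] == "mandarin-espnet/mixdata_svs_visinger2_spkemb_lang_pretrained_avg"
)
print(entry["model_path"])  # espnet/mixdata_svs_visinger2_spkemb_lang_pretrained_avg
print(entry["lang"])        # mandarin
```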
evaluation/svs_eval.py CHANGED
@@ -11,7 +11,7 @@ from pathlib import Path
 def init_singmos():
     print("[Init] Loading SingMOS...")
     return torch.hub.load(
-        "South-Twilight/SingMOS:v0.2.0", "singing_ssl_mos", trust_repo=True
+        "South-Twilight/SingMOS:v0.3.0", "singing_ssl_mos", trust_repo=True
     )
 
 
@@ -23,7 +23,17 @@ def init_basic_pitch():
 
 
 def init_per():
-    return None  # TODO: implement PER evaluation
+    print("[Init] Loading PER...")
+    from transformers import pipeline
+    import jiwer
+
+    asr_pipeline = pipeline(
+        "automatic-speech-recognition", model="openai/whisper-large-v3-turbo"
+    )
+    return {
+        "asr_pipeline": asr_pipeline,
+        "jiwer": jiwer,
+    }
 
 
 def init_audiobox_aesthetics():
@@ -72,10 +82,40 @@ def compute_dissonance_rate(intervals, dissonant_intervals={1, 2, 6, 10, 11}):
     return np.mean(dissonant) if intervals else np.nan
 
 
-def eval_per(audio_path, model=None):
+def pypinyin_g2p_phone_without_prosody(text):
+    from pypinyin import Style, pinyin
+    from pypinyin.style._utils import get_finals, get_initials
+
+    phones = []
+    for phone in pinyin(text, style=Style.NORMAL, strict=False):
+        initial = get_initials(phone[0], strict=False)
+        final = get_finals(phone[0], strict=False)
+        if len(initial) != 0:
+            if initial in ["x", "y", "j", "q"]:
+                if final == "un":
+                    final = "vn"
+                elif final == "uan":
+                    final = "van"
+                elif final == "u":
+                    final = "v"
+            if final == "ue":
+                final = "ve"
+            phones.append(initial)
+            phones.append(final)
+        else:
+            phones.append(final)
+    return phones
+
+
+def eval_per(audio_path, reference_text, evaluator):
     audio_array, sr = librosa.load(audio_path, sr=16000)
-    # TODO: implement PER evaluation
-    return {}
+    asr_result = evaluator["asr_pipeline"](
+        audio_array, generate_kwargs={"language": "mandarin"}
+    )["text"]
+    hyp_pinyin = pypinyin_g2p_phone_without_prosody(asr_result)
+    ref_pinyin = pypinyin_g2p_phone_without_prosody(reference_text)
+    per = evaluator["jiwer"].wer(" ".join(ref_pinyin), " ".join(hyp_pinyin))
+    return {"per": per}
 
 
 def eval_aesthetic(audio_path, predictor):
@@ -99,12 +139,12 @@ def load_evaluators(config):
     return loaded
 
 
-def run_evaluation(audio_path, evaluators):
+def run_evaluation(audio_path, evaluators, **kwargs):
     results = {}
     if "singmos" in evaluators:
         results.update(eval_singmos(audio_path, evaluators["singmos"]))
     if "per" in evaluators:
-        results.update(eval_per(audio_path, evaluators["per"]))
+        results.update(eval_per(audio_path, kwargs["llm_text"], evaluators["per"]))
     if "melody" in evaluators:
         results.update(eval_melody_metrics(audio_path, evaluators["melody"]))
     if "aesthetic" in evaluators:
interface.py CHANGED
@@ -24,6 +24,7 @@ class GradioInterface:
             self.character_info[self.current_character].default_voice
         ]
         self.pipeline = SingingDialoguePipeline(self.default_config)
+        self.results = None
 
     def load_config(self, path: str):
         with open(path, "r") as f:
@@ -211,21 +212,22 @@ class GradioInterface:
         if not audio_path:
             return gr.update(value=""), gr.update(value="")
         tmp_file = f"audio_{int(time.time())}_{uuid.uuid4().hex[:8]}.wav"
-        results = self.pipeline.run(
+        self.results = self.pipeline.run(
            audio_path,
            self.svs_model_map[self.current_svs_model]["lang"],
            self.character_info[self.current_character].prompt,
            self.current_voice,
            output_audio_path=tmp_file,
        )
-        formatted_logs = f"ASR: {results['asr_text']}\nLLM: {results['llm_text']}"
+        formatted_logs = f"ASR: {self.results['asr_text']}\nLLM: {self.results['llm_text']}"
         return gr.update(value=formatted_logs), gr.update(
-            value=results["output_audio_path"]
+            value=self.results["output_audio_path"]
         )
 
     def update_metrics(self, audio_path):
-        if not audio_path:
+        if not audio_path or not self.results:
             return gr.update(value="")
-        results = self.pipeline.evaluate(audio_path)
+        results = self.pipeline.evaluate(audio_path, **self.results)
+        results.update(self.results.get("metrics", {}))
         formatted_metrics = "\n".join([f"{k}: {v}" for k, v in results.items()])
         return gr.update(value=formatted_metrics)
modules/llm/gemini.py CHANGED
@@ -28,6 +28,7 @@ class GeminiLLM(AbstractLLMModel):
         prompt: str,
         system_prompt: Optional[str] = None,
         max_output_tokens: int = 1024,
+        max_iterations: int = 3,
         **kwargs,
     ) -> str:
         generation_config_dict = {
@@ -36,15 +37,17 @@ class GeminiLLM(AbstractLLMModel):
         }
         if system_prompt:
             generation_config_dict["system_instruction"] = system_prompt
-        response = self.client.models.generate_content(
-            model=self.model_id,
-            contents=prompt,
-            config=types.GenerateContentConfig(**generation_config_dict),
-        )
-        if response.text:
-            return response.text
-        else:
-            print(
-                f"No response from Gemini. May need to increase max_new_tokens. Current max_new_tokens: {max_new_tokens}"
-            )
-        return ""
+        for _ in range(max_iterations):
+            response = self.client.models.generate_content(
+                model=self.model_id,
+                contents=prompt,
+                config=types.GenerateContentConfig(**generation_config_dict),
+            )
+            if response.text:
+                return response.text
+            else:
+                print(
+                    f"No response from Gemini. May need to increase max_output_tokens. Current {max_output_tokens=}. Try again."
+                )
+        print(f"Failed to generate response from Gemini after {max_iterations} attempts.")
+        return ""
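
The change boils down to a retry-until-non-empty loop; a sketch of the same pattern independent of the Gemini client, with `generate_with_retry` being a hypothetical helper rather than anything in this repo:

```python
# Generic form of the retry loop above: call until non-empty text or give up.
from typing import Callable, Optional


def generate_with_retry(generate: Callable[[], Optional[str]], max_iterations: int = 3) -> str:
    for attempt in range(1, max_iterations + 1):
        text = generate()
        if text:
            return text
        print(f"Empty response (attempt {attempt}/{max_iterations}); retrying.")
    return ""
```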
pipeline.py CHANGED
@@ -34,7 +34,7 @@ class SingingDialoguePipeline:
         self.melody_controller = MelodyController(
             config["melody_source"], self.cache_dir
         )
-        self.max_sentences = config.get("max_sentences", 2)
+        self.max_sentences = config.get("max_sentences", 5)
         self.track_latency = config.get("track_latency", False)
         self.evaluators = load_evaluators(config.get("evaluators", {}).get("svs", []))
 
@@ -42,6 +42,7 @@ class SingingDialoguePipeline:
         if self.asr is not None:
             del self.asr
         import gc
+
         gc.collect()
         torch.cuda.empty_cache()
         self.asr = get_asr_model(
@@ -52,6 +53,7 @@ class SingingDialoguePipeline:
         if self.llm is not None:
             del self.llm
         import gc
+
         gc.collect()
         torch.cuda.empty_cache()
         self.llm = get_llm_model(
@@ -62,6 +64,7 @@ class SingingDialoguePipeline:
         if self.svs is not None:
             del self.svs
         import gc
+
         gc.collect()
         torch.cuda.empty_cache()
         self.svs = get_svs_model(
@@ -124,5 +127,5 @@ class SingingDialoguePipeline:
         }
         return results
 
-    def evaluate(self, audio_path):
-        return run_evaluation(audio_path, self.evaluators)
+    def evaluate(self, audio_path, **kwargs):
+        return run_evaluation(audio_path, self.evaluators, **kwargs)
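
With the widened signature, callers forward the `run()` results so that `eval_per` can read `llm_text` as its reference. A usage sketch under stated assumptions: the evaluator list and the texts below are placeholders, and `"response.wav"` stands in for a synthesized reply:

```python
# How evaluate()/run_evaluation consume the extra keyword arguments.
from evaluation.svs_eval import load_evaluators, run_evaluation

evaluators = load_evaluators(["singmos", "per"])  # hypothetical evaluator list
results = run_evaluation(
    "response.wav",        # placeholder path to a synthesized reply
    evaluators,
    asr_text="...",        # unused keys pass through **kwargs untouched
    llm_text="你好,世界",  # eval_per reads this as the PER reference text
)
print(results)  # e.g. {"singmos": ..., "per": ...}
```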