Han Jionghao committed on
Commit be6640f · unverified · 2 Parent(s): 629b906 be053b4

Merge branch 'refactor' into fwh_dev

.gitattributes CHANGED
@@ -1,2 +1,4 @@
  *.png filter=lfs diff=lfs merge=lfs -text
  *.jpg filter=lfs diff=lfs merge=lfs -text
+ *.wav filter=lfs diff=lfs merge=lfs -text
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,144 @@
+ # SingingSDS: Role-Playing Singing Spoken Dialogue System
+
+ A role-playing singing dialogue system that converts speech input into character-based singing output.
+
+ ## Installation
+
+ ### Requirements
+
+ - Python 3.11+
+ - CUDA (optional, for GPU acceleration)
+
+ ### Install Dependencies
+
+ #### Option 1: Using Conda (Recommended)
+
+ ```bash
+ conda create -n singingsds python=3.11
+ conda activate singingsds
+ conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia
+ pip install -r requirements.txt
+ ```
+
+ #### Option 2: Using pip only
+
+ ```bash
+ pip install -r requirements.txt
+ ```
+
+ #### Option 3: Using pip with virtual environment
+
+ ```bash
+ python -m venv singingsds_env
+
+ # On Windows:
+ singingsds_env\Scripts\activate
+ # On macOS/Linux:
+ source singingsds_env/bin/activate
+
+ pip install -r requirements.txt
+ ```
+
+ ## Usage
+
+ ### Command Line Interface (CLI)
+
+ #### Example Usage
+
+ ```bash
+ python cli.py --query_audio tests/audio/hello.wav --config_path config/cli/yaoyin_default.yaml --output_audio outputs/yaoyin_hello.wav
+ ```
+
+ #### Parameter Description
+
+ - `--query_audio`: Input audio file path (required)
+ - `--config_path`: Configuration file path (default: `config/cli/yaoyin_default.yaml`)
+ - `--output_audio`: Output audio file path (required)
+
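+ The CLI is a thin wrapper around `SingingDialoguePipeline`. As a rough sketch (mirroring `cli.py` from this commit; error handling omitted), the same run can be driven from Python:
+
+ ```python
+ import yaml
+
+ from characters import CHARACTERS
+ from pipeline import SingingDialoguePipeline
+
+ # Load the same YAML config the CLI uses.
+ with open("config/cli/yaoyin_default.yaml") as f:
+     config = yaml.safe_load(f)
+
+ pipeline = SingingDialoguePipeline(config)
+ character = CHARACTERS[config["prompt_template_character"]]
+
+ results = pipeline.run(
+     "tests/audio/hello.wav",      # query audio
+     config["language"],           # e.g. "mandarin"
+     character.prompt,             # character prompt template
+     config["speaker"],            # speaker id (or embedding path)
+     output_audio_path="outputs/yaoyin_hello.wav",
+ )
+ print(results["asr_text"], results["llm_text"])
+ ```
+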
+ ### Web Interface (Gradio)
+
+ Start the web interface:
+
+ ```bash
+ python app.py
+ ```
+
+ Then visit the displayed address in your browser to use the graphical interface.
+
+ ## Configuration
+
+ ### Character Configuration
+
+ The system supports multiple preset characters, each defined as a small module under `characters/` (see the sketch below):
+
+ - **Yaoyin (遥音)**: Default voice is `voice2`
+ - **Limei (丽梅)**: Default voice is `voice1`
+
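+ As a rough illustration (following `characters/base.py` and the existing character modules; the `Mia` character, its asset path, and the import path are hypothetical), a new character would look like:
+
+ ```python
+ # characters/Mia.py -- hypothetical example, not part of this commit
+ from characters.base import Character  # assumed import path
+
+
+ def get_character():
+     return Character(
+         name="Mia",
+         image_path="assets/character_mia.png",  # hypothetical asset
+         default_voice="voice3",                 # must be a key in the SVS model's voices map
+         prompt="You are Mia, a wandering singer who answers every question in song.",
+     )
+ ```
+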
+ ### Model Configuration
+
+ #### ASR Models
+ - `openai/whisper-large-v3-turbo`
+ - `openai/whisper-large-v3`
+ - `openai/whisper-medium`
+ - `sanchit-gandhi/whisper-small-dv`
+ - `facebook/wav2vec2-base-960h`
+
+ #### LLM Models
+ - `google/gemma-2-2b`
+ - `MiniMaxAI/MiniMax-M1-80k`
+ - `meta-llama/Llama-3.2-3B-Instruct`
+
+ #### SVS Models
+ - `espnet/mixdata_svs_visinger2_spkemb_lang_pretrained` (Bilingual)
+ - `espnet/aceopencpop_svs_visinger2_40singer_pretrain` (Chinese)
+
+ ## Project Structure
+
+ ```
+ SingingSDS/
+ ├── cli.py                    # Command line interface
+ ├── interface.py              # Gradio interface
+ ├── pipeline.py               # Core processing pipeline
+ ├── app.py                    # Web application entry
+ ├── requirements.txt          # Python dependencies
+ ├── config/                   # Configuration files
+ │   ├── cli/                  # CLI-specific configuration
+ │   └── interface/            # Interface-specific configuration
+ ├── modules/                  # Core modules
+ │   ├── asr.py                # Speech recognition module
+ │   ├── llm/                  # Large language model modules
+ │   │   ├── base.py           # Base LLM class
+ │   │   ├── hf_pipeline.py    # Hugging Face text-generation wrapper
+ │   │   ├── qwen.py           # Qwen wrapper
+ │   │   ├── registry.py       # LLM model registry
+ │   │   └── __init__.py       # LLM module initialization
+ │   ├── melody.py             # Melody control module
+ │   ├── svs/                  # Singing voice synthesis modules
+ │   │   ├── base.py           # Base SVS class
+ │   │   ├── espnet.py         # ESPnet SVS implementation
+ │   │   ├── registry.py       # SVS model registry
+ │   │   └── __init__.py       # SVS module initialization
+ │   └── utils/                # Utility modules
+ │       ├── g2p.py            # Grapheme-to-phoneme conversion
+ │       ├── text_normalize.py # Text normalization
+ │       └── resources/        # Utility resources
+ ├── characters/               # Character definitions
+ │   ├── base.py               # Base character class
+ │   ├── Limei.py              # Limei character definition
+ │   ├── Yaoyin.py             # Yaoyin character definition
+ │   └── __init__.py           # Character module initialization
+ ├── evaluation/               # Evaluation modules
+ │   └── svs_eval.py           # SVS evaluation metrics
+ ├── data/                     # Data directory
+ │   ├── kising/               # Kising dataset
+ │   └── touhou/               # Touhou dataset
+ ├── resources/                # Project resources
+ ├── data_handlers/            # Data handling utilities
+ ├── assets/                   # Static assets
+ └── tests/                    # Test files
+ ```
+
+ ## Contributing
+
+ Issues and Pull Requests are welcome!
+
+ ## License
+
app.py CHANGED
@@ -3,7 +3,8 @@ from interface import GradioInterface
3
 
4
  def main():
5
  demo = GradioInterface(
6
- options_config="config/options.yaml", default_config="config/default.yaml"
 
7
  ).create_interface()
8
  demo.launch()
9
 
 
3
 
4
  def main():
5
  demo = GradioInterface(
6
+ options_config="config/interface/options.yaml",
7
+ default_config="config/interface/default.yaml",
8
  ).create_interface()
9
  demo.launch()
10
 
characters/Limei.py CHANGED
@@ -5,7 +5,7 @@ def get_character():
5
  return Character(
6
  name="Limei (丽梅)",
7
  image_path="assets/character_limei.png",
8
- default_timbre="timbre1",
9
  prompt="""你是丽梅(Limei),来自幻想世界"长歌原"的角色,一个以歌声传承记忆的世界。
10
  你是灵响界山林音乐之城"莲鸣"的现任守护者,十九岁的公主殿下,肩负维系与传承城市核心"千年歌谱"的重任。千年歌谱承载着莲鸣城的历史、逝者的余音与后世的誓言,由历任守护者续写。
11
 
 
5
  return Character(
6
  name="Limei (丽梅)",
7
  image_path="assets/character_limei.png",
8
+ default_voice="voice1",
9
  prompt="""你是丽梅(Limei),来自幻想世界"长歌原"的角色,一个以歌声传承记忆的世界。
10
  你是灵响界山林音乐之城"莲鸣"的现任守护者,十九岁的公主殿下,肩负维系与传承城市核心"千年歌谱"的重任。千年歌谱承载着莲鸣城的历史、逝者的余音与后世的誓言,由历任守护者续写。
11
 
characters/Yaoyin.py CHANGED
@@ -5,7 +5,7 @@ def get_character():
5
  return Character(
6
  name="Yaoyin (遥音)",
7
  image_path="assets/character_yaoyin.jpg",
8
- default_timbre="timbre2",
9
  prompt="""你是遥音(Yaoyin),来自幻想世界"长歌原"的角色,一个以歌声传承记忆的世界。
10
  你是游历四方的歌者与吟游诗人,出生于鹿鸣山·云歌村,常年行走各地,采集歌谣与故事。
11
 
 
5
  return Character(
6
  name="Yaoyin (遥音)",
7
  image_path="assets/character_yaoyin.jpg",
8
+ default_voice="voice2",
9
  prompt="""你是遥音(Yaoyin),来自幻想世界"长歌原"的角色,一个以歌声传承记忆的世界。
10
  你是游历四方的歌者与吟游诗人,出生于鹿鸣山·云歌村,常年行走各地,采集歌谣与故事。
11
 
characters/base.py CHANGED
@@ -5,5 +5,5 @@ from dataclasses import dataclass
5
  class Character:
6
  name: str
7
  image_path: str
8
- default_timbre: str
9
  prompt: str
 
5
  class Character:
6
  name: str
7
  image_path: str
8
+ default_voice: str
9
  prompt: str
cli.py ADDED
@@ -0,0 +1,52 @@
+ from argparse import ArgumentParser
+ from logging import getLogger
+ from pathlib import Path
+
+ import yaml
+
+ from characters import CHARACTERS
+ from pipeline import SingingDialoguePipeline
+
+ logger = getLogger(__name__)
+
+
+ def get_parser():
+     parser = ArgumentParser()
+     parser.add_argument("--query_audio", type=Path, required=True)
+     parser.add_argument(
+         "--config_path", type=Path, default="config/cli/yaoyin_default.yaml"
+     )
+     parser.add_argument("--output_audio", type=Path, required=True)
+     return parser
+
+
+ def load_config(config_path: Path):
+     with open(config_path, "r") as f:
+         config = yaml.safe_load(f)
+     return config
+
+
+ def main():
+     parser = get_parser()
+     args = parser.parse_args()
+     config = load_config(args.config_path)
+     pipeline = SingingDialoguePipeline(config)
+     speaker = config["speaker"]
+     language = config["language"]
+     character_name = config["prompt_template_character"]
+     character = CHARACTERS[character_name]
+     prompt_template = character.prompt
+     results = pipeline.run(
+         args.query_audio,
+         language,
+         prompt_template,
+         speaker,
+         output_audio_path=args.output_audio,
+     )
+     logger.info(
+         f"Input: {args.query_audio}, Output: {args.output_audio}, ASR results: {results['asr_text']}, LLM results: {results['llm_text']}"
+     )
+
+
+ if __name__ == "__main__":
+     main()
config/cli/limei_default.yaml ADDED
@@ -0,0 +1,16 @@
+ asr_model: openai/whisper-large-v3-turbo
+ llm_model: google/gemma-2-2b
+ svs_model: espnet/aceopencpop_svs_visinger2_40singer_pretrain
+ melody_source: sample-lyric-kising
+ language: mandarin
+ prompt_template_character: Limei
+ speaker: 5
+ cache_dir: .cache
+
+ track_latency: True
+ evaluators:
+   svs:
+     - singmos
+     - per
+     - melody
+     - aesthetic
config/cli/yaoyin_default.yaml ADDED
@@ -0,0 +1,16 @@
+ asr_model: openai/whisper-large-v3-turbo
+ llm_model: google/gemma-2-2b
+ svs_model: espnet/aceopencpop_svs_visinger2_40singer_pretrain
+ melody_source: sample-lyric-kising
+ language: mandarin
+ prompt_template_character: Yaoyin
+ speaker: 9
+ cache_dir: .cache
+
+ track_latency: True
+ evaluators:
+   svs:
+     - singmos
+     - per
+     - melody
+     - aesthetic
config/cli/yaoyin_test.yaml ADDED
@@ -0,0 +1,11 @@
+ asr_model: openai/whisper-small
+ llm_model: google/gemma-2-2b
+ svs_model: espnet/aceopencpop_svs_visinger2_40singer_pretrain
+ melody_source: sample-lyric-kising
+ language: mandarin
+ max_sentences: 1
+ prompt_template_character: Yaoyin
+ speaker: 9
+ cache_dir: .cache
+
+ track_latency: True
config/{default.yaml → interface/default.yaml} RENAMED
File without changes
config/{options.yaml → interface/options.yaml} RENAMED
@@ -5,6 +5,8 @@ asr_models:
5
  name: Whisper large-v3
6
  - id: openai/whisper-medium
7
  name: Whisper medium
 
 
8
  - id: sanchit-gandhi/whisper-small-dv
9
  name: Whisper small-dv
10
  - id: facebook/wav2vec2-base-960h
@@ -15,38 +17,40 @@ llm_models:
15
  name: Gemma 2 2B
16
  - id: MiniMaxAI/MiniMax-M1-80k
17
  name: MiniMax M1 80k
 
 
18
 
19
  svs_models:
20
  - id: mandarin-espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
21
  name: Visinger2 (Bilingual)-zh
22
  model_path: espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
23
  lang: mandarin
24
- embeddings:
25
- timbre1: resource/singer/singer_embedding_ace-2.npy
26
- timbre2: resource/singer/singer_embedding_ace-8.npy
27
- timbre3: resource/singer/singer_embedding_itako.npy
28
- timbre4: resource/singer/singer_embedding_kising_orange.npy
29
- timbre5: resource/singer/singer_embedding_m4singer_Alto-4.npy
30
  - id: japanese-espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
31
  name: Visinger2 (Bilingual)-jp
32
  model_path: espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
33
  lang: japanese
34
- embeddings:
35
- timbre1: resource/singer/singer_embedding_ace-2.npy
36
- timbre2: resource/singer/singer_embedding_ace-8.npy
37
- timbre3: resource/singer/singer_embedding_itako.npy
38
- timbre4: resource/singer/singer_embedding_kising_orange.npy
39
- timbre5: resource/singer/singer_embedding_m4singer_Alto-4.npy
40
  - id: mandarin-espnet/aceopencpop_svs_visinger2_40singer_pretrain
41
  name: Visinger2 (Chinese)
42
  model_path: espnet/aceopencpop_svs_visinger2_40singer_pretrain
43
  lang: mandarin
44
- embeddings:
45
- timbre1: 5
46
- timbre2: 8
47
- timbre3: 12
48
- timbre4: 15
49
- timbre5: 29
50
 
51
  melody_sources:
52
  - id: gen-random-none
 
5
  name: Whisper large-v3
6
  - id: openai/whisper-medium
7
  name: Whisper medium
8
+ - id: openai/whisper-small
9
+ name: Whisper small
10
  - id: sanchit-gandhi/whisper-small-dv
11
  name: Whisper small-dv
12
  - id: facebook/wav2vec2-base-960h
 
17
  name: Gemma 2 2B
18
  - id: MiniMaxAI/MiniMax-M1-80k
19
  name: MiniMax M1 80k
20
+ - id: meta-llama/Llama-3.2-3B-Instruct
21
+ name: Llama 3.2 3B Instruct
22
 
23
  svs_models:
24
  - id: mandarin-espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
25
  name: Visinger2 (Bilingual)-zh
26
  model_path: espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
27
  lang: mandarin
28
+ voices:
29
+ voice1: resource/singer/singer_embedding_ace-2.npy
30
+ voice2: resource/singer/singer_embedding_ace-8.npy
31
+ voice3: resource/singer/singer_embedding_itako.npy
32
+ voice4: resource/singer/singer_embedding_kising_orange.npy
33
+ voice5: resource/singer/singer_embedding_m4singer_Alto-4.npy
34
  - id: japanese-espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
35
  name: Visinger2 (Bilingual)-jp
36
  model_path: espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
37
  lang: japanese
38
+ voices:
39
+ voice1: resource/singer/singer_embedding_ace-2.npy
40
+ voice2: resource/singer/singer_embedding_ace-8.npy
41
+ voice3: resource/singer/singer_embedding_itako.npy
42
+ voice4: resource/singer/singer_embedding_kising_orange.npy
43
+ voice5: resource/singer/singer_embedding_m4singer_Alto-4.npy
44
  - id: mandarin-espnet/aceopencpop_svs_visinger2_40singer_pretrain
45
  name: Visinger2 (Chinese)
46
  model_path: espnet/aceopencpop_svs_visinger2_40singer_pretrain
47
  lang: mandarin
48
+ voices:
49
+ voice1: 5
50
+ voice2: 8
51
+ voice3: 12
52
+ voice4: 15
53
+ voice5: 29
54
 
55
  melody_sources:
56
  - id: gen-random-none
evaluation/svs_eval.py CHANGED
@@ -37,7 +37,8 @@ def init_audiobox_aesthetics():
37
  # ----------- Evaluation -----------
38
 
39
 
40
- def eval_singmos(audio_array, sr, predictor):
 
41
  wav = librosa.resample(audio_array, orig_sr=sr, target_sr=16000)
42
  wav_tensor = torch.from_numpy(wav).unsqueeze(0)
43
  length_tensor = torch.tensor([wav_tensor.shape[1]])
@@ -71,7 +72,8 @@ def compute_dissonance_rate(intervals, dissonant_intervals={1, 2, 6, 10, 11}):
71
  return np.mean(dissonant) if intervals else np.nan
72
 
73
 
74
- def eval_per(audio_array, sr, model=None):
 
75
  # TODO: implement PER evaluation
76
  return {}
77
 
@@ -97,20 +99,16 @@ def load_evaluators(config):
97
  return loaded
98
 
99
 
100
- def run_evaluation(audio_array, sr, evaluators):
101
  results = {}
102
  if "singmos" in evaluators:
103
- results.update(eval_singmos(audio_array, sr, evaluators["singmos"]))
104
  if "per" in evaluators:
105
- results.update(eval_per(audio_array, sr, evaluators["per"]))
106
- # create a tmp file with unique name
107
- tmp_path = Path(".tmp") / f"{uuid.uuid4()}.wav"
108
- sf.write(tmp_path, audio_array, sr)
109
  if "melody" in evaluators:
110
- results.update(eval_melody_metrics(tmp_path, evaluators["melody"]))
111
  if "aesthetic" in evaluators:
112
- results.update(eval_aesthetic(tmp_path, evaluators["aesthetic"]))
113
- tmp_path.unlink()
114
  return results
115
 
116
 
@@ -122,9 +120,8 @@ if __name__ == "__main__":
122
  parser.add_argument("--results_csv", type=str, required=True)
123
  parser.add_argument("--evaluators", type=str, default="singmos,melody,aesthetic")
124
  args = parser.parse_args()
125
- audio_array, sr = librosa.load(args.wav_path, sr=None)
126
  evaluators = load_evaluators(args.evaluators.split(","))
127
- results = run_evaluation(audio_array, sr, evaluators)
128
  print(results)
129
 
130
  with open(args.results_csv, "a") as f:
 
37
  # ----------- Evaluation -----------
38
 
39
 
40
+ def eval_singmos(audio_path, predictor):
41
+ audio_array, sr = librosa.load(audio_path, sr=44100)
42
  wav = librosa.resample(audio_array, orig_sr=sr, target_sr=16000)
43
  wav_tensor = torch.from_numpy(wav).unsqueeze(0)
44
  length_tensor = torch.tensor([wav_tensor.shape[1]])
 
72
  return np.mean(dissonant) if intervals else np.nan
73
 
74
 
75
+ def eval_per(audio_path, model=None):
76
+ audio_array, sr = librosa.load(audio_path, sr=16000)
77
  # TODO: implement PER evaluation
78
  return {}
79
 
 
99
  return loaded
100
 
101
 
102
+ def run_evaluation(audio_path, evaluators):
103
  results = {}
104
  if "singmos" in evaluators:
105
+ results.update(eval_singmos(audio_path, evaluators["singmos"]))
106
  if "per" in evaluators:
107
+ results.update(eval_per(audio_path, evaluators["per"]))
 
 
 
108
  if "melody" in evaluators:
109
+ results.update(eval_melody_metrics(audio_path, evaluators["melody"]))
110
  if "aesthetic" in evaluators:
111
+ results.update(eval_aesthetic(audio_path, evaluators["aesthetic"]))
 
112
  return results
113
 
114
 
 
120
  parser.add_argument("--results_csv", type=str, required=True)
121
  parser.add_argument("--evaluators", type=str, default="singmos,melody,aesthetic")
122
  args = parser.parse_args()
 
123
  evaluators = load_evaluators(args.evaluators.split(","))
124
+ results = run_evaluation(args.wav_path, evaluators)
125
  print(results)
126
 
127
  with open(args.results_csv, "a") as f:
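With this change the evaluation helpers operate on a file path instead of a decoded array. A minimal sketch of standalone use, assuming the evaluator names used in the CLI configs (`singmos`, `melody`, `aesthetic`):

```python
from evaluation.svs_eval import load_evaluators, run_evaluation

# Load only the metrics you need; "per" is still a stub in this commit.
evaluators = load_evaluators(["singmos", "melody", "aesthetic"])

# run_evaluation now takes the path to a synthesized wav file.
results = run_evaluation("outputs/yaoyin_hello.wav", evaluators)
print(results)
```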
interface.py CHANGED
@@ -1,3 +1,6 @@
 
 
 
1
  import gradio as gr
2
  import yaml
3
 
@@ -17,8 +20,8 @@ class GradioInterface:
17
  self.current_svs_model = (
18
  f"{self.default_config['language']}-{self.default_config['svs_model']}"
19
  )
20
- self.current_timbre = self.svs_model_map[self.current_svs_model]["embeddings"][
21
- self.character_info[self.current_character].default_timbre
22
  ]
23
  self.pipeline = SingingDialoguePipeline(self.default_config)
24
 
@@ -104,21 +107,21 @@ class GradioInterface:
104
  value=self.current_svs_model,
105
  )
106
  with gr.Row():
107
- timbre_radio = gr.Radio(
108
- label="Singing Timbre",
109
  choices=list(
110
  self.svs_model_map[self.current_svs_model][
111
- "embeddings"
112
  ].keys()
113
  ),
114
  value=self.character_info[
115
  self.current_character
116
- ].default_timbre,
117
  )
118
  character_radio.change(
119
  fn=self.update_character,
120
  inputs=character_radio,
121
- outputs=[character_image, timbre_radio],
122
  )
123
  asr_radio.change(
124
  fn=self.update_asr_model, inputs=asr_radio, outputs=asr_radio
@@ -129,35 +132,41 @@ class GradioInterface:
129
  svs_radio.change(
130
  fn=self.update_svs_model,
131
  inputs=svs_radio,
132
- outputs=[svs_radio, timbre_radio],
133
  )
134
  melody_radio.change(
135
  fn=self.update_melody_source,
136
  inputs=melody_radio,
137
  outputs=melody_radio,
138
  )
139
- timbre_radio.change(
140
- fn=self.update_timbre, inputs=timbre_radio, outputs=timbre_radio
141
  )
142
  mic_input.change(
143
  fn=self.run_pipeline,
144
  inputs=mic_input,
145
  outputs=[interaction_log, audio_output],
146
  )
 
 
 
 
 
147
 
148
  return demo
149
  except Exception as e:
150
  print(f"error: {e}")
151
  breakpoint()
 
152
 
153
  def update_character(self, character):
154
  self.current_character = character
155
- character_timbre = self.character_info[self.current_character].default_timbre
156
- self.current_timbre = self.svs_model_map[self.current_svs_model]["embeddings"][
157
- character_timbre
158
  ]
159
  return gr.update(value=self.character_info[character].image_path), gr.update(
160
- value=character_timbre
161
  )
162
 
163
  def update_asr_model(self, asr_model):
@@ -170,23 +179,23 @@ class GradioInterface:
170
 
171
  def update_svs_model(self, svs_model):
172
  self.current_svs_model = svs_model
173
- character_timbre = self.character_info[self.current_character].default_timbre
174
- self.current_timbre = self.svs_model_map[self.current_svs_model]["embeddings"][
175
- character_timbre
176
  ]
177
  self.pipeline.set_svs_model(
178
  self.svs_model_map[self.current_svs_model]["model_path"]
179
  )
180
  print(
181
- f"SVS model updated to {self.current_svs_model}. Will set gradio svs_radio to {svs_model} and timbre_radio to {character_timbre}"
182
  )
183
  return (
184
  gr.update(value=svs_model),
185
  gr.update(
186
  choices=list(
187
- self.svs_model_map[self.current_svs_model]["embeddings"].keys()
188
  ),
189
- value=character_timbre,
190
  ),
191
  )
192
 
@@ -194,24 +203,30 @@ class GradioInterface:
194
  self.current_melody_source = melody_source
195
  return gr.update(value=self.current_melody_source)
196
 
197
- def update_timbre(self, timbre):
198
- self.current_timbre = self.svs_model_map[self.current_svs_model]["embeddings"][
199
- timbre
200
- ]
201
- return gr.update(value=timbre)
202
 
203
  def run_pipeline(self, audio_path):
 
 
 
204
  results = self.pipeline.run(
205
  audio_path,
206
  self.svs_model_map[self.current_svs_model]["lang"],
207
  self.character_info[self.current_character].prompt,
208
- svs_inference_kwargs={
209
- "speaker": self.current_timbre,
210
- },
211
- max_new_tokens=100,
212
  )
213
  formatted_logs = f"ASR: {results['asr_text']}\nLLM: {results['llm_text']}"
214
- return gr.update(value=formatted_logs), gr.update(value=results["svs_audio"])
 
 
215
 
216
- def run_evaluation(self, audio, audio_sample_rate):
217
- pass
 
 
 
 
 
1
+ import time
2
+ import uuid
3
+
4
  import gradio as gr
5
  import yaml
6
 
 
20
  self.current_svs_model = (
21
  f"{self.default_config['language']}-{self.default_config['svs_model']}"
22
  )
23
+ self.current_voice = self.svs_model_map[self.current_svs_model]["voices"][
24
+ self.character_info[self.current_character].default_voice
25
  ]
26
  self.pipeline = SingingDialoguePipeline(self.default_config)
27
 
 
107
  value=self.current_svs_model,
108
  )
109
  with gr.Row():
110
+ voice_radio = gr.Radio(
111
+ label="Singing voice",
112
  choices=list(
113
  self.svs_model_map[self.current_svs_model][
114
+ "voices"
115
  ].keys()
116
  ),
117
  value=self.character_info[
118
  self.current_character
119
+ ].default_voice,
120
  )
121
  character_radio.change(
122
  fn=self.update_character,
123
  inputs=character_radio,
124
+ outputs=[character_image, voice_radio],
125
  )
126
  asr_radio.change(
127
  fn=self.update_asr_model, inputs=asr_radio, outputs=asr_radio
 
132
  svs_radio.change(
133
  fn=self.update_svs_model,
134
  inputs=svs_radio,
135
+ outputs=[svs_radio, voice_radio],
136
  )
137
  melody_radio.change(
138
  fn=self.update_melody_source,
139
  inputs=melody_radio,
140
  outputs=melody_radio,
141
  )
142
+ voice_radio.change(
143
+ fn=self.update_voice, inputs=voice_radio, outputs=voice_radio
144
  )
145
  mic_input.change(
146
  fn=self.run_pipeline,
147
  inputs=mic_input,
148
  outputs=[interaction_log, audio_output],
149
  )
150
+ metrics_button.click(
151
+ fn=self.update_metrics,
152
+ inputs=audio_output,
153
+ outputs=[metrics_output],
154
+ )
155
 
156
  return demo
157
  except Exception as e:
158
  print(f"error: {e}")
159
  breakpoint()
160
+ return gr.Blocks()
161
 
162
  def update_character(self, character):
163
  self.current_character = character
164
+ character_voice = self.character_info[self.current_character].default_voice
165
+ self.current_voice = self.svs_model_map[self.current_svs_model]["voices"][
166
+ character_voice
167
  ]
168
  return gr.update(value=self.character_info[character].image_path), gr.update(
169
+ value=character_voice
170
  )
171
 
172
  def update_asr_model(self, asr_model):
 
179
 
180
  def update_svs_model(self, svs_model):
181
  self.current_svs_model = svs_model
182
+ character_voice = self.character_info[self.current_character].default_voice
183
+ self.current_voice = self.svs_model_map[self.current_svs_model]["voices"][
184
+ character_voice
185
  ]
186
  self.pipeline.set_svs_model(
187
  self.svs_model_map[self.current_svs_model]["model_path"]
188
  )
189
  print(
190
+ f"SVS model updated to {self.current_svs_model}. Will set gradio svs_radio to {svs_model} and voice_radio to {character_voice}"
191
  )
192
  return (
193
  gr.update(value=svs_model),
194
  gr.update(
195
  choices=list(
196
+ self.svs_model_map[self.current_svs_model]["voices"].keys()
197
  ),
198
+ value=character_voice,
199
  ),
200
  )
201
 
 
203
  self.current_melody_source = melody_source
204
  return gr.update(value=self.current_melody_source)
205
 
206
+ def update_voice(self, voice):
207
+ self.current_voice = self.svs_model_map[self.current_svs_model]["voices"][voice]
208
+ return gr.update(value=voice)
 
 
209
 
210
  def run_pipeline(self, audio_path):
211
+ if not audio_path:
212
+ return gr.update(value=""), gr.update(value="")
213
+ tmp_file = f"audio_{int(time.time())}_{uuid.uuid4().hex[:8]}.wav"
214
  results = self.pipeline.run(
215
  audio_path,
216
  self.svs_model_map[self.current_svs_model]["lang"],
217
  self.character_info[self.current_character].prompt,
218
+ self.current_voice,
219
+ output_audio_path=tmp_file,
220
+ max_new_tokens=50,
 
221
  )
222
  formatted_logs = f"ASR: {results['asr_text']}\nLLM: {results['llm_text']}"
223
+ return gr.update(value=formatted_logs), gr.update(
224
+ value=results["output_audio_path"]
225
+ )
226
 
227
+ def update_metrics(self, audio_path):
228
+ if not audio_path:
229
+ return gr.update(value="")
230
+ results = self.pipeline.evaluate(audio_path)
231
+ formatted_metrics = "\n".join([f"{k}: {v}" for k, v in results.items()])
232
+ return gr.update(value=formatted_metrics)
modules/asr.py CHANGED
@@ -1,5 +1,4 @@
1
- from __future__ import annotations
2
-
3
  from abc import ABC, abstractmethod
4
 
5
  import librosa
@@ -7,17 +6,17 @@ import numpy as np
7
  from transformers import pipeline
8
 
9
  ASR_MODEL_REGISTRY = {}
 
10
 
11
 
12
  class AbstractASRModel(ABC):
13
- @abstractmethod
14
  def __init__(
15
  self, model_id: str, device: str = "cpu", cache_dir: str = "cache", **kwargs
16
  ):
 
17
  self.model_id = model_id
18
  self.device = device
19
  self.cache_dir = cache_dir
20
- pass
21
 
22
  @abstractmethod
23
  def transcribe(self, audio: np.ndarray, audio_sample_rate: int, **kwargs) -> str:
@@ -52,15 +51,11 @@ class WhisperASR(AbstractASRModel):
52
  "automatic-speech-recognition",
53
  model=model_id,
54
  device=0 if device == "cuda" else -1,
 
55
  **kwargs,
56
  )
57
 
58
  def transcribe(self, audio: np.ndarray, audio_sample_rate: int, language: str, **kwargs) -> str:
59
  if audio_sample_rate != 16000:
60
- try:
61
- audio, _ = librosa.resample(audio, orig_sr=audio_sample_rate, target_sr=16000)
62
- except Exception as e:
63
- breakpoint()
64
- print(f"Error resampling audio: {e}")
65
- audio = librosa.resample(audio, orig_sr=audio_sample_rate, target_sr=16000)
66
- return self.pipe(audio, generate_kwargs={"language": language}).get("text", "")
 
1
+ import os
 
2
  from abc import ABC, abstractmethod
3
 
4
  import librosa
 
6
  from transformers import pipeline
7
 
8
  ASR_MODEL_REGISTRY = {}
9
+ hf_token = os.getenv("HF_TOKEN")
10
 
11
 
12
  class AbstractASRModel(ABC):
 
13
  def __init__(
14
  self, model_id: str, device: str = "cpu", cache_dir: str = "cache", **kwargs
15
  ):
16
+ print(f"Loading ASR model {model_id}...")
17
  self.model_id = model_id
18
  self.device = device
19
  self.cache_dir = cache_dir
 
20
 
21
  @abstractmethod
22
  def transcribe(self, audio: np.ndarray, audio_sample_rate: int, **kwargs) -> str:
 
51
  "automatic-speech-recognition",
52
  model=model_id,
53
  device=0 if device == "cuda" else -1,
54
+ token=hf_token,
55
  **kwargs,
56
  )
57
 
58
  def transcribe(self, audio: np.ndarray, audio_sample_rate: int, language: str, **kwargs) -> str:
59
  if audio_sample_rate != 16000:
60
+ audio = librosa.resample(audio, orig_sr=audio_sample_rate, target_sr=16000)
61
+ return self.pipe(audio, generate_kwargs={"language": language}, return_timestamps=False).get("text", "")
 
 
 
 
 
modules/llm/__init__.py ADDED
@@ -0,0 +1,11 @@
+ from .base import AbstractLLMModel
+ from .registry import LLM_MODEL_REGISTRY, get_llm_model, register_llm_model
+ from .hf_pipeline import HFTextGenerationLLM
+ from .qwen import QwenLLM
+
+ __all__ = [
+     "AbstractLLMModel",
+     "get_llm_model",
+     "register_llm_model",
+     "LLM_MODEL_REGISTRY",
+ ]
modules/llm/base.py ADDED
@@ -0,0 +1,15 @@
+ from abc import ABC, abstractmethod
+
+
+ class AbstractLLMModel(ABC):
+     def __init__(
+         self, model_id: str, device: str = "cpu", cache_dir: str = "cache", **kwargs
+     ):
+         print(f"Loading LLM model {model_id}...")
+         self.model_id = model_id
+         self.device = device
+         self.cache_dir = cache_dir
+
+     @abstractmethod
+     def generate(self, prompt: str, **kwargs) -> str:
+         pass
modules/{llm.py → llm/hf_pipeline.py} RENAMED
@@ -1,44 +1,21 @@
1
- from abc import ABC, abstractmethod
2
 
3
  from transformers import pipeline
4
 
5
- LLM_MODEL_REGISTRY = {}
 
6
 
7
-
8
- class AbstractLLMModel(ABC):
9
- @abstractmethod
10
- def __init__(
11
- self, model_id: str, device: str = "cpu", cache_dir: str = "cache", **kwargs
12
- ): ...
13
-
14
- @abstractmethod
15
- def generate(self, prompt: str, **kwargs) -> str:
16
- pass
17
-
18
-
19
- def register_llm_model(prefix: str):
20
- def wrapper(cls):
21
- assert issubclass(cls, AbstractLLMModel), f"{cls} must inherit AbstractLLMModel"
22
- LLM_MODEL_REGISTRY[prefix] = cls
23
- return cls
24
-
25
- return wrapper
26
-
27
-
28
- def get_llm_model(model_id: str, device="cpu", **kwargs) -> AbstractLLMModel:
29
- for prefix, cls in LLM_MODEL_REGISTRY.items():
30
- if model_id.startswith(prefix):
31
- return cls(model_id, device=device, **kwargs)
32
- raise ValueError(f"No LLM wrapper found for model: {model_id}")
33
 
34
 
35
- @register_llm_model("google/gemma")
36
- @register_llm_model("tii/") # e.g., Falcon
37
- @register_llm_model("meta-llama")
38
  class HFTextGenerationLLM(AbstractLLMModel):
39
  def __init__(
40
  self, model_id: str, device: str = "cpu", cache_dir: str = "cache", **kwargs
41
  ):
 
42
  model_kwargs = kwargs.setdefault("model_kwargs", {})
43
  model_kwargs["cache_dir"] = cache_dir
44
  self.pipe = pipeline(
@@ -46,6 +23,8 @@ class HFTextGenerationLLM(AbstractLLMModel):
46
  model=model_id,
47
  device=0 if device == "cuda" else -1,
48
  return_full_text=False,
 
 
49
  **kwargs,
50
  )
51
 
 
1
+ import os
2
 
3
  from transformers import pipeline
4
 
5
+ from .base import AbstractLLMModel
6
+ from .registry import register_llm_model
7
 
8
+ hf_token = os.getenv("HF_TOKEN")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
 
11
+ @register_llm_model("openai-community/")
12
+ @register_llm_model("google/gemma-")
13
+ @register_llm_model("meta-llama/Llama-")
14
  class HFTextGenerationLLM(AbstractLLMModel):
15
  def __init__(
16
  self, model_id: str, device: str = "cpu", cache_dir: str = "cache", **kwargs
17
  ):
18
+ super().__init__(model_id, device, cache_dir, **kwargs)
19
  model_kwargs = kwargs.setdefault("model_kwargs", {})
20
  model_kwargs["cache_dir"] = cache_dir
21
  self.pipe = pipeline(
 
23
  model=model_id,
24
  device=0 if device == "cuda" else -1,
25
  return_full_text=False,
26
+ token=hf_token,
27
+ trust_remote_code=True,
28
  **kwargs,
29
  )
30
 
modules/llm/registry.py ADDED
@@ -0,0 +1,19 @@
+ from .base import AbstractLLMModel
+
+ LLM_MODEL_REGISTRY = {}
+
+
+ def register_llm_model(prefix: str):
+     def wrapper(cls):
+         assert issubclass(cls, AbstractLLMModel), f"{cls} must inherit AbstractLLMModel"
+         LLM_MODEL_REGISTRY[prefix] = cls
+         return cls
+
+     return wrapper
+
+
+ def get_llm_model(model_id: str, device="cpu", **kwargs) -> AbstractLLMModel:
+     for prefix, cls in LLM_MODEL_REGISTRY.items():
+         if model_id.startswith(prefix):
+             return cls(model_id, device=device, **kwargs)
+     raise ValueError(f"No LLM wrapper found for model: {model_id}")
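The new registry resolves an LLM wrapper class by model-id prefix. A minimal sketch of how a custom backend would plug in (the `EchoLLM` class and the `myorg/` prefix are hypothetical):

```python
from modules.llm import AbstractLLMModel, get_llm_model, register_llm_model


@register_llm_model("myorg/")  # hypothetical prefix
class EchoLLM(AbstractLLMModel):
    def generate(self, prompt: str, **kwargs) -> str:
        # Trivial stand-in for a real text-generation backend.
        return prompt


llm = get_llm_model("myorg/echo-1b", device="cpu")
print(llm.generate("hello"))
```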
modules/melody.py CHANGED
@@ -109,9 +109,10 @@ class MelodyController:
109
  if pitch == 0:
110
  score.append((st, ed, ref_lyric, pitch))
111
  elif ref_lyric in ["-", "——"] and align_type == "lyric":
112
- score.append((st, ed, ref_lyric, pitch))
113
- text_idx += 1
114
  else:
115
  score.append((st, ed, text_list[text_idx], pitch))
116
  text_idx += 1
 
 
117
  return score
 
109
  if pitch == 0:
110
  score.append((st, ed, ref_lyric, pitch))
111
  elif ref_lyric in ["-", "——"] and align_type == "lyric":
112
+ score.append((st, ed, "-", pitch))
 
113
  else:
114
  score.append((st, ed, text_list[text_idx], pitch))
115
  text_idx += 1
116
+ if text_idx >= len(text_list):
117
+ break
118
  return score
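For context, the melody controller turns the cleaned LLM reply into a score of `(start_sec, end_sec, lyric, pitch)` tuples that the SVS model consumes. A hedged usage sketch (the lyric text is a placeholder), following how the pipeline constructs and calls it:

```python
from modules.melody import MelodyController

controller = MelodyController("sample-lyric-kising", ".cache")
score = controller.generate_score("今天天气真好", "mandarin")
# Each entry is (start_sec, end_sec, lyric_or_rest, pitch); rests keep the reference lyric.
print(score[:3])
```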
modules/svs/base.py CHANGED
@@ -13,6 +13,8 @@ class AbstractSVSModel(ABC):
13
  def synthesize(
14
  self,
15
  score: list[tuple[float, float, str, int]],
 
 
16
  **kwargs,
17
  ) -> tuple[np.ndarray, int]:
18
  """
 
13
  def synthesize(
14
  self,
15
  score: list[tuple[float, float, str, int]],
16
+ language: str,
17
+ speaker: str,
18
  **kwargs,
19
  ) -> tuple[np.ndarray, int]:
20
  """
modules/svs/espnet.py CHANGED
@@ -53,7 +53,7 @@ class ESPNetSVS(AbstractSVSModel):
53
  phoneme_mappers = {}
54
  return phoneme_mappers
55
 
56
- def _preprocess(self, score: list[tuple[float, float, str, int]], language: str):
57
  if language not in self.phoneme_mappers:
58
  raise ValueError(f"Unsupported language: {language} for {self.model_id}")
59
  phoneme_mapper = self.phoneme_mappers[language]
@@ -90,20 +90,20 @@ class ESPNetSVS(AbstractSVSModel):
90
  pre_phn = phn_units[-1]
91
 
92
  batch = {
93
- "score": {
94
- "tempo": 120, # does not affect svs result, as note durations are in time unit
95
- "notes": notes,
96
- },
97
  "text": " ".join(phns),
98
  }
99
  return batch
100
 
101
  def synthesize(
102
- self, score: list[tuple[float, float, str, int]], language: str, **kwargs
103
  ):
104
  batch = self._preprocess(score, language)
105
  if self.model_id == "espnet/aceopencpop_svs_visinger2_40singer_pretrain":
106
- sid = np.array([int(kwargs["speaker"])])
107
  output_dict = self.model(batch, sids=sid)
108
  elif self.model_id == "espnet/mixdata_svs_visinger2_spkemb_lang_pretrained":
109
  langs = {
@@ -115,7 +115,7 @@ class ESPNetSVS(AbstractSVSModel):
115
  f"Unsupported language: {language} for {self.model_id}"
116
  )
117
  lid = np.array([langs[language]])
118
- spk_embed = np.load(kwargs["speaker"])
119
  output_dict = self.model(batch, lids=lid, spembs=spk_embed)
120
  else:
121
  raise NotImplementedError(f"Model {self.model_id} not supported")
 
53
  phoneme_mappers = {}
54
  return phoneme_mappers
55
 
56
+ def _preprocess(self, score: list[tuple[float, float, str, int] | tuple[float, float, str, float]], language: str):
57
  if language not in self.phoneme_mappers:
58
  raise ValueError(f"Unsupported language: {language} for {self.model_id}")
59
  phoneme_mapper = self.phoneme_mappers[language]
 
90
  pre_phn = phn_units[-1]
91
 
92
  batch = {
93
+ "score": (
94
+ 120, # does not affect svs result, as note durations are in time unit
95
+ notes,
96
+ ),
97
  "text": " ".join(phns),
98
  }
99
  return batch
100
 
101
  def synthesize(
102
+ self, score: list[tuple[float, float, str, float] | tuple[float, float, str, int]], language: str, speaker: str, **kwargs
103
  ):
104
  batch = self._preprocess(score, language)
105
  if self.model_id == "espnet/aceopencpop_svs_visinger2_40singer_pretrain":
106
+ sid = np.array([int(speaker)])
107
  output_dict = self.model(batch, sids=sid)
108
  elif self.model_id == "espnet/mixdata_svs_visinger2_spkemb_lang_pretrained":
109
  langs = {
 
115
  f"Unsupported language: {language} for {self.model_id}"
116
  )
117
  lid = np.array([langs[language]])
118
+ spk_embed = np.load(speaker)
119
  output_dict = self.model(batch, lids=lid, spembs=spk_embed)
120
  else:
121
  raise NotImplementedError(f"Model {self.model_id} not supported")
modules/utils/g2p.py CHANGED
@@ -32,6 +32,7 @@ for plan in ace_phonemes_all_plans["plans"]:
32
 
33
 
34
  def preprocess_text(text: str, language: str) -> list[str]:
 
35
  if language == "mandarin":
36
  text_list = to_pinyin(text)
37
  elif language == "japanese":
 
32
 
33
 
34
  def preprocess_text(text: str, language: str) -> list[str]:
35
+ text = text.replace(" ", "")
36
  if language == "mandarin":
37
  text_list = to_pinyin(text)
38
  elif language == "japanese":
modules/utils/text_normalize.py CHANGED
@@ -3,12 +3,13 @@ from typing import Optional
3
 
4
 
5
  def remove_non_zh_jp(text: str) -> str:
6
- pattern = r"[^\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\u3000-\u303f\uff01-\uffef]"
7
  return re.sub(pattern, "", text)
8
 
9
 
10
  def truncate_sentences(text: str, max_sentences: int) -> str:
11
- sentences = re.split(r"(?<=[。!?])", text)
 
12
  return "".join(sentences[:max_sentences]).strip()
13
 
14
 
 
3
 
4
 
5
  def remove_non_zh_jp(text: str) -> str:
6
+ pattern = r"[^\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\u3000-\u303f\uff01-\uffef\s]"
7
  return re.sub(pattern, "", text)
8
 
9
 
10
  def truncate_sentences(text: str, max_sentences: int) -> str:
11
+ sentences = re.split(r"(?<=[。!?!?~])|(?:\n+)|(?: {2,})", text)
12
+ sentences = [s.strip() for s in sentences if s.strip()]
13
  return "".join(sentences[:max_sentences]).strip()
14
 
15
 
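For clarity, a small worked example of the updated `truncate_sentences` behavior (it now also splits on newlines and runs of two or more spaces, and drops empty segments):

```python
import re


def truncate_sentences(text: str, max_sentences: int) -> str:
    # Same logic as modules/utils/text_normalize.py after this change.
    sentences = re.split(r"(?<=[。!?!?~])|(?:\n+)|(?: {2,})", text)
    sentences = [s.strip() for s in sentences if s.strip()]
    return "".join(sentences[:max_sentences]).strip()


print(truncate_sentences("你好!今天天气很好。我们去散步吧?", 2))
# -> 你好!今天天气很好。
```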
pipeline.py CHANGED
@@ -1,6 +1,11 @@
1
- import torch
 
2
  import time
 
 
3
  import librosa
 
 
4
 
5
  from modules.asr import get_asr_model
6
  from modules.llm import get_llm_model
@@ -29,6 +34,7 @@ class SingingDialoguePipeline:
29
  self.melody_controller = MelodyController(
30
  config["melody_source"], self.cache_dir
31
  )
 
32
  self.track_latency = config.get("track_latency", False)
33
  self.evaluators = load_evaluators(config.get("evaluators", {}).get("svs", []))
34
 
@@ -55,8 +61,9 @@ class SingingDialoguePipeline:
55
  audio_path,
56
  language,
57
  prompt_template,
58
- svs_inference_kwargs,
59
- max_new_tokens=100,
 
60
  ):
61
  if self.track_latency:
62
  asr_start_time = time.time()
@@ -75,13 +82,14 @@ class SingingDialoguePipeline:
75
  if self.track_latency:
76
  llm_end_time = time.time()
77
  llm_latency = llm_end_time - llm_start_time
78
- print(f"llm output: {output}确认一下是不是不含prompt的")
79
- llm_response = clean_llm_output(output, language=language)
 
80
  score = self.melody_controller.generate_score(llm_response, language)
81
  if self.track_latency:
82
  svs_start_time = time.time()
83
  singing_audio, sample_rate = self.svs.synthesize(
84
- score, language=language, **svs_inference_kwargs
85
  )
86
  if self.track_latency:
87
  svs_end_time = time.time()
@@ -89,15 +97,19 @@ class SingingDialoguePipeline:
89
  results = {
90
  "asr_text": asr_result,
91
  "llm_text": llm_response,
92
- "svs_audio": (singing_audio, sample_rate),
93
  }
 
 
 
 
94
  if self.track_latency:
95
- results["metrics"].update({
96
  "asr_latency": asr_latency,
97
  "llm_latency": llm_latency,
98
  "svs_latency": svs_latency,
99
- })
100
  return results
101
 
102
- def evaluate(self, audio, sample_rate):
103
- return run_evaluation(audio, sample_rate, self.evaluators)
 
1
+ from __future__ import annotations
2
+
3
  import time
4
+ from pathlib import Path
5
+
6
  import librosa
7
+ import soundfile as sf
8
+ import torch
9
 
10
  from modules.asr import get_asr_model
11
  from modules.llm import get_llm_model
 
34
  self.melody_controller = MelodyController(
35
  config["melody_source"], self.cache_dir
36
  )
37
+ self.max_sentences = config.get("max_sentences", 2)
38
  self.track_latency = config.get("track_latency", False)
39
  self.evaluators = load_evaluators(config.get("evaluators", {}).get("svs", []))
40
 
 
61
  audio_path,
62
  language,
63
  prompt_template,
64
+ speaker,
65
+ output_audio_path: Path | str = None,
66
+ max_new_tokens=50,
67
  ):
68
  if self.track_latency:
69
  asr_start_time = time.time()
 
82
  if self.track_latency:
83
  llm_end_time = time.time()
84
  llm_latency = llm_end_time - llm_start_time
85
+ llm_response = clean_llm_output(
86
+ output, language=language, max_sentences=self.max_sentences
87
+ )
88
  score = self.melody_controller.generate_score(llm_response, language)
89
  if self.track_latency:
90
  svs_start_time = time.time()
91
  singing_audio, sample_rate = self.svs.synthesize(
92
+ score, language=language, speaker=speaker
93
  )
94
  if self.track_latency:
95
  svs_end_time = time.time()
 
97
  results = {
98
  "asr_text": asr_result,
99
  "llm_text": llm_response,
100
+ "svs_audio": (sample_rate, singing_audio),
101
  }
102
+ if output_audio_path:
103
+ Path(output_audio_path).parent.mkdir(parents=True, exist_ok=True)
104
+ sf.write(output_audio_path, singing_audio, sample_rate)
105
+ results["output_audio_path"] = output_audio_path
106
  if self.track_latency:
107
+ results["metrics"] = {
108
  "asr_latency": asr_latency,
109
  "llm_latency": llm_latency,
110
  "svs_latency": svs_latency,
111
+ }
112
  return results
113
 
114
+ def evaluate(self, audio_path):
115
+ return run_evaluation(audio_path, self.evaluators)
requirements.txt CHANGED
@@ -1,6 +1,6 @@
1
  git+https://github.com/espnet/espnet
2
  espnet_model_zoo
3
- # pyopenjtalk
4
  datasets
5
  torchaudio
6
  typeguard==4.4.0
@@ -15,3 +15,6 @@ transformers
15
  s3prl
16
  zhconv
17
  git+https://github.com/sea-turt1e/kanjiconv
 
 
 
 
1
  git+https://github.com/espnet/espnet
2
  espnet_model_zoo
3
+ pyopenjtalk
4
  datasets
5
  torchaudio
6
  typeguard==4.4.0
 
15
  s3prl
16
  zhconv
17
  git+https://github.com/sea-turt1e/kanjiconv
18
+ soundfile
19
+ PyYAML
20
+ gradio
tests/__init__.py ADDED
File without changes
tests/audio/chat.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:181a7f27f8acb00cba0276d0ff88759120a76eebd47b4e0a60c2424e43e5cbaf
3
+ size 271030
tests/audio/feeling.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0fef036c2bf0ddf635a004845e94c89d0658f754a53e12fadbb50511d3cd6c15
3
+ size 263502
tests/audio/hello.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa7e839d32f7bda77cad11fc13fd1b92df939479612dd5af079d8f9b19598c0d
3
+ size 263502
tests/audio/interesting.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a1618f73d90ad068d5eb72455ac812b49fcb9e44e88af5e67ef88f5c6ddb74a
3
+ size 429086
tests/audio/music.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6388b587e282e8f6457b629b5cbb9fd50c5cb6a7f90c446329a3f23be8b1442c
3
+ size 286082
tests/audio/where_from.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ef81772b96813216d7b14d3d70a39b040e9c542d896d9337f8975f8fd6da96e
3
+ size 195766
tests/test_llm_infer.py ADDED
@@ -0,0 +1,26 @@
+ from modules.llm import get_llm_model
+
+ if __name__ == "__main__":
+     supported_llms = [
+         # "MiniMaxAI/MiniMax-M1-80k",  # -> load with custom code
+         # "Qwen/Qwen-1_8B",
+         # "meta-llama/Llama-3.1-8B-Instruct",  # pending for approval
+         # "tiiuae/Falcon-H1-1B-Base",
+         # "tiiuae/Falcon-H1-3B-Instruct",
+         # "tencent/Hunyuan-A13B-Instruct",  # -> load with custom code
+         # "deepseek-ai/DeepSeek-R1-0528",
+         # "openai-community/gpt2-xl",
+         # "google/gemma-2-2b",
+     ]
+     for model_id in supported_llms:
+         try:
+             print(f"Loading model: {model_id}")
+             llm = get_llm_model(model_id, cache_dir="./.cache")
+             prompt = "你好,今天你心情怎么样?"
+             result = llm.generate(prompt)
+             print(f"=================")
+             print(f"[{model_id}] LLM inference result:", result)
+         except Exception as e:
+             print(f"Failed to load model {model_id}: {e}")
+             breakpoint()
+             continue