Merge branch 'refactor' into fwh_dev
- .gitattributes +2 -0
- README.md +144 -0
- app.py +2 -1
- characters/Limei.py +1 -1
- characters/Yaoyin.py +1 -1
- characters/base.py +1 -1
- cli.py +52 -0
- config/cli/limei_default.yaml +16 -0
- config/cli/yaoyin_default.yaml +16 -0
- config/cli/yaoyin_test.yaml +11 -0
- config/{default.yaml → interface/default.yaml} +0 -0
- config/{options.yaml → interface/options.yaml} +22 -18
- evaluation/svs_eval.py +10 -13
- interface.py +47 -32
- modules/asr.py +6 -11
- modules/llm/__init__.py +11 -0
- modules/llm/base.py +15 -0
- modules/{llm.py → llm/hf_pipeline.py} +10 -31
- modules/llm/registry.py +19 -0
- modules/melody.py +3 -2
- modules/svs/base.py +2 -0
- modules/svs/espnet.py +8 -8
- modules/utils/g2p.py +1 -0
- modules/utils/text_normalize.py +3 -2
- pipeline.py +23 -11
- requirements.txt +4 -1
- tests/__init__.py +0 -0
- tests/audio/chat.wav +3 -0
- tests/audio/feeling.wav +3 -0
- tests/audio/hello.wav +3 -0
- tests/audio/interesting.wav +3 -0
- tests/audio/music.wav +3 -0
- tests/audio/where_from.wav +3 -0
- tests/test_llm_infer.py +26 -0
.gitattributes
CHANGED
@@ -1,2 +1,4 @@
 *.png filter=lfs diff=lfs merge=lfs -text
 *.jpg filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text
+*.mp3 filter=lfs diff=lfs merge=lfs -text
README.md
ADDED
@@ -0,0 +1,144 @@
# SingingSDS: Role-Playing Singing Spoken Dialogue System

A role-playing singing dialogue system that converts speech input into character-based singing output.

## Installation

### Requirements

- Python 3.11+
- CUDA (optional, for GPU acceleration)

### Install Dependencies

#### Option 1: Using Conda (Recommended)

```bash
conda create -n singingsds python=3.11
conda activate singingsds
conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia
pip install -r requirements.txt
```

#### Option 2: Using pip only

```bash
pip install -r requirements.txt
```

#### Option 3: Using pip with virtual environment

```bash
python -m venv singingsds_env

# On Windows:
singingsds_env\Scripts\activate
# On macOS/Linux:
source singingsds_env/bin/activate

pip install -r requirements.txt
```

## Usage

### Command Line Interface (CLI)

#### Example Usage

```bash
python cli.py --query_audio tests/audio/hello.wav --config_path config/cli/yaoyin_default.yaml --output_audio outputs/yaoyin_hello.wav
```

#### Parameter Description

- `--query_audio`: Input audio file path (required)
- `--config_path`: Configuration file path (default: config/cli/yaoyin_default.yaml)
- `--output_audio`: Output audio file path (required)

### Web Interface (Gradio)

Start the web interface:

```bash
python app.py
```

Then visit the displayed address in your browser to use the graphical interface.

## Configuration

### Character Configuration

The system supports multiple preset characters:

- **Yaoyin (遥音)**: Default voice is `voice2`
- **Limei (丽梅)**: Default voice is `voice1`

### Model Configuration

#### ASR Models
- `openai/whisper-large-v3-turbo`
- `openai/whisper-large-v3`
- `openai/whisper-medium`
- `sanchit-gandhi/whisper-small-dv`
- `facebook/wav2vec2-base-960h`

#### LLM Models
- `google/gemma-2-2b`
- `MiniMaxAI/MiniMax-M1-80k`
- `meta-llama/Llama-3.2-3B-Instruct`

#### SVS Models
- `espnet/mixdata_svs_visinger2_spkemb_lang_pretrained` (Bilingual)
- `espnet/aceopencpop_svs_visinger2_40singer_pretrain` (Chinese)

## Project Structure

```
SingingSDS/
├── cli.py                 # Command line interface
├── interface.py           # Gradio interface
├── pipeline.py            # Core processing pipeline
├── app.py                 # Web application entry
├── requirements.txt       # Python dependencies
├── config/                # Configuration files
│   ├── cli/               # CLI-specific configuration
│   └── interface/         # Interface-specific configuration
├── modules/               # Core modules
│   ├── asr.py             # Speech recognition module
│   ├── llm.py             # Large language model module
│   ├── melody.py          # Melody control module
│   ├── svs/               # Singing voice synthesis modules
│   │   ├── base.py        # Base SVS class
│   │   ├── espnet.py      # ESPnet SVS implementation
│   │   ├── registry.py    # SVS model registry
│   │   └── __init__.py    # SVS module initialization
│   └── utils/             # Utility modules
│       ├── g2p.py         # Grapheme-to-phoneme conversion
│       ├── text_normalize.py  # Text normalization
│       └── resources/     # Utility resources
├── characters/            # Character definitions
│   ├── base.py            # Base character class
│   ├── Limei.py           # Limei character definition
│   ├── Yaoyin.py          # Yaoyin character definition
│   └── __init__.py        # Character module initialization
├── evaluation/            # Evaluation modules
│   └── svs_eval.py        # SVS evaluation metrics
├── data/                  # Data directory
│   ├── kising/            # Kising dataset
│   └── touhou/            # Touhou dataset
├── resources/             # Project resources
├── data_handlers/         # Data handling utilities
├── assets/                # Static assets
└── tests/                 # Test files
```

## Contributing

Issues and Pull Requests are welcome!

## License

app.py
CHANGED
@@ -3,7 +3,8 @@ from interface import GradioInterface
 
 def main():
     demo = GradioInterface(
-        options_config="config/options.yaml",
+        options_config="config/interface/options.yaml",
+        default_config="config/interface/default.yaml",
     ).create_interface()
     demo.launch()
 
characters/Limei.py
CHANGED
@@ -5,7 +5,7 @@ def get_character():
     return Character(
         name="Limei (丽梅)",
         image_path="assets/character_limei.png",
-
+        default_voice="voice1",
         prompt="""你是丽梅(Limei),来自幻想世界"长歌原"的角色,一个以歌声传承记忆的世界。
 你是灵响界山林音乐之城"莲鸣"的现任守护者,十九岁的公主殿下,肩负维系与传承城市核心"千年歌谱"的重任。千年歌谱承载着莲鸣城的历史、逝者的余音与后世的誓言,由历任守护者续写。
characters/Yaoyin.py
CHANGED
@@ -5,7 +5,7 @@ def get_character():
     return Character(
         name="Yaoyin (遥音)",
         image_path="assets/character_yaoyin.jpg",
-
+        default_voice="voice2",
         prompt="""你是遥音(Yaoyin),来自幻想世界"长歌原"的角色,一个以歌声传承记忆的世界。
 你是游历四方的歌者与吟游诗人,出生于鹿鸣山·云歌村,常年行走各地,采集歌谣与故事。
characters/base.py
CHANGED
@@ -5,5 +5,5 @@ from dataclasses import dataclass
 class Character:
     name: str
     image_path: str
-
+    default_voice: str
     prompt: str
cli.py
ADDED
@@ -0,0 +1,52 @@
from argparse import ArgumentParser
from logging import getLogger
from pathlib import Path

import yaml

from characters import CHARACTERS
from pipeline import SingingDialoguePipeline

logger = getLogger(__name__)


def get_parser():
    parser = ArgumentParser()
    parser.add_argument("--query_audio", type=Path, required=True)
    parser.add_argument(
        "--config_path", type=Path, default="config/cli/yaoyin_default.yaml"
    )
    parser.add_argument("--output_audio", type=Path, required=True)
    return parser


def load_config(config_path: Path):
    with open(config_path, "r") as f:
        config = yaml.safe_load(f)
    return config


def main():
    parser = get_parser()
    args = parser.parse_args()
    config = load_config(args.config_path)
    pipeline = SingingDialoguePipeline(config)
    speaker = config["speaker"]
    language = config["language"]
    character_name = config["prompt_template_character"]
    character = CHARACTERS[character_name]
    prompt_template = character.prompt
    results = pipeline.run(
        args.query_audio,
        language,
        prompt_template,
        speaker,
        output_audio_path=args.output_audio,
    )
    logger.info(
        f"Input: {args.query_audio}, Output: {args.output_audio}, ASR results: {results['asr_text']}, LLM results: {results['llm_text']}"
    )


if __name__ == "__main__":
    main()
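For reference, the CLI above reduces to a short programmatic call. A minimal sketch, assuming the same config keys as the `config/cli/*.yaml` files added below (paths are illustrative):

```python
# Minimal sketch: driving the pipeline without cli.py (mirrors main() above).
import yaml

from characters import CHARACTERS
from pipeline import SingingDialoguePipeline

with open("config/cli/yaoyin_default.yaml") as f:
    config = yaml.safe_load(f)

pipeline = SingingDialoguePipeline(config)
character = CHARACTERS[config["prompt_template_character"]]

results = pipeline.run(
    "tests/audio/hello.wav",        # query audio
    config["language"],             # e.g. "mandarin"
    character.prompt,               # character prompt template
    config["speaker"],              # SVS speaker id or embedding path
    output_audio_path="outputs/demo.wav",
)
print(results["asr_text"], results["llm_text"])
```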
config/cli/limei_default.yaml
ADDED
@@ -0,0 +1,16 @@
asr_model: openai/whisper-large-v3-turbo
llm_model: google/gemma-2-2b
svs_model: espnet/aceopencpop_svs_visinger2_40singer_pretrain
melody_source: sample-lyric-kising
language: mandarin
prompt_template_character: Limei
speaker: 5
cache_dir: .cache

track_latency: True
evaluators:
  svs:
    - singmos
    - per
    - melody
    - aesthetic
config/cli/yaoyin_default.yaml
ADDED
@@ -0,0 +1,16 @@
asr_model: openai/whisper-large-v3-turbo
llm_model: google/gemma-2-2b
svs_model: espnet/aceopencpop_svs_visinger2_40singer_pretrain
melody_source: sample-lyric-kising
language: mandarin
prompt_template_character: Yaoyin
speaker: 9
cache_dir: .cache

track_latency: True
evaluators:
  svs:
    - singmos
    - per
    - melody
    - aesthetic
config/cli/yaoyin_test.yaml
ADDED
@@ -0,0 +1,11 @@
asr_model: openai/whisper-small
llm_model: google/gemma-2-2b
svs_model: espnet/aceopencpop_svs_visinger2_40singer_pretrain
melody_source: sample-lyric-kising
language: mandarin
max_sentences: 1
prompt_template_character: Yaoyin
speaker: 9
cache_dir: .cache

track_latency: True
config/{default.yaml → interface/default.yaml}
RENAMED
File without changes
config/{options.yaml → interface/options.yaml}
RENAMED
@@ -5,6 +5,8 @@ asr_models:
     name: Whisper large-v3
   - id: openai/whisper-medium
     name: Whisper medium
+  - id: openai/whisper-small
+    name: Whisper small
   - id: sanchit-gandhi/whisper-small-dv
     name: Whisper small-dv
   - id: facebook/wav2vec2-base-960h
@@ -15,38 +17,40 @@ llm_models:
     name: Gemma 2 2B
   - id: MiniMaxAI/MiniMax-M1-80k
     name: MiniMax M1 80k
+  - id: meta-llama/Llama-3.2-3B-Instruct
+    name: Llama 3.2 3B Instruct
 
 svs_models:
   - id: mandarin-espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
     name: Visinger2 (Bilingual)-zh
     model_path: espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
     lang: mandarin
-
-
-
-
-
-
+    voices:
+      voice1: resource/singer/singer_embedding_ace-2.npy
+      voice2: resource/singer/singer_embedding_ace-8.npy
+      voice3: resource/singer/singer_embedding_itako.npy
+      voice4: resource/singer/singer_embedding_kising_orange.npy
+      voice5: resource/singer/singer_embedding_m4singer_Alto-4.npy
   - id: japanese-espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
     name: Visinger2 (Bilingual)-jp
     model_path: espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
     lang: japanese
-
-
-
-
-
-
+    voices:
+      voice1: resource/singer/singer_embedding_ace-2.npy
+      voice2: resource/singer/singer_embedding_ace-8.npy
+      voice3: resource/singer/singer_embedding_itako.npy
+      voice4: resource/singer/singer_embedding_kising_orange.npy
+      voice5: resource/singer/singer_embedding_m4singer_Alto-4.npy
   - id: mandarin-espnet/aceopencpop_svs_visinger2_40singer_pretrain
     name: Visinger2 (Chinese)
     model_path: espnet/aceopencpop_svs_visinger2_40singer_pretrain
     lang: mandarin
-
-
-
-
-
-
+    voices:
+      voice1: 5
+      voice2: 8
+      voice3: 12
+      voice4: 15
+      voice5: 29
 
 melody_sources:
   - id: gen-random-none
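The `voices` mapping added above is what the interface resolves a character's `default_voice` against. A rough sketch of that lookup, using values from this options file (the dict-style access mirrors interface.py further down):

```python
# Sketch: resolving a voice name to the value handed to the SVS model.
# For the bilingual checkpoint the value is a speaker-embedding .npy path;
# for the Chinese 40-singer checkpoint it is an integer speaker id.
svs_model_map = {
    "mandarin-espnet/aceopencpop_svs_visinger2_40singer_pretrain": {
        "model_path": "espnet/aceopencpop_svs_visinger2_40singer_pretrain",
        "lang": "mandarin",
        "voices": {"voice1": 5, "voice2": 8, "voice3": 12, "voice4": 15, "voice5": 29},
    }
}

current_svs_model = "mandarin-espnet/aceopencpop_svs_visinger2_40singer_pretrain"
default_voice = "voice2"  # e.g. Yaoyin's default_voice
speaker = svs_model_map[current_svs_model]["voices"][default_voice]
print(speaker)  # -> 8, passed as `speaker` to pipeline.run / svs.synthesize
```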
evaluation/svs_eval.py
CHANGED
@@ -37,7 +37,8 @@ def init_audiobox_aesthetics():
 # ----------- Evaluation -----------
 
 
-def eval_singmos(
+def eval_singmos(audio_path, predictor):
+    audio_array, sr = librosa.load(audio_path, sr=44100)
     wav = librosa.resample(audio_array, orig_sr=sr, target_sr=16000)
     wav_tensor = torch.from_numpy(wav).unsqueeze(0)
     length_tensor = torch.tensor([wav_tensor.shape[1]])
@@ -71,7 +72,8 @@ def compute_dissonance_rate(intervals, dissonant_intervals={1, 2, 6, 10, 11}):
     return np.mean(dissonant) if intervals else np.nan
 
 
-def eval_per(
+def eval_per(audio_path, model=None):
+    audio_array, sr = librosa.load(audio_path, sr=16000)
     # TODO: implement PER evaluation
     return {}
 
@@ -97,20 +99,16 @@ def load_evaluators(config):
     return loaded
 
 
-def run_evaluation(
+def run_evaluation(audio_path, evaluators):
     results = {}
     if "singmos" in evaluators:
-        results.update(eval_singmos(
+        results.update(eval_singmos(audio_path, evaluators["singmos"]))
     if "per" in evaluators:
-        results.update(eval_per(
-    # create a tmp file with unique name
-    tmp_path = Path(".tmp") / f"{uuid.uuid4()}.wav"
-    sf.write(tmp_path, audio_array, sr)
+        results.update(eval_per(audio_path, evaluators["per"]))
     if "melody" in evaluators:
-        results.update(eval_melody_metrics(
+        results.update(eval_melody_metrics(audio_path, evaluators["melody"]))
     if "aesthetic" in evaluators:
-        results.update(eval_aesthetic(
-    tmp_path.unlink()
+        results.update(eval_aesthetic(audio_path, evaluators["aesthetic"]))
     return results
 
 
@@ -122,9 +120,8 @@ if __name__ == "__main__":
     parser.add_argument("--results_csv", type=str, required=True)
     parser.add_argument("--evaluators", type=str, default="singmos,melody,aesthetic")
     args = parser.parse_args()
-    audio_array, sr = librosa.load(args.wav_path, sr=None)
     evaluators = load_evaluators(args.evaluators.split(","))
-    results = run_evaluation(
+    results = run_evaluation(args.wav_path, evaluators)
     print(results)
 
     with open(args.results_csv, "a") as f:
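After this change every metric takes the audio path directly, so a standalone evaluation is a two-call affair. A minimal sketch mirroring the `__main__` block above (the returned keys depend on which evaluators actually load):

```python
# Sketch: offline evaluation of a synthesized wav, same API as svs_eval.py.
from evaluation.svs_eval import load_evaluators, run_evaluation

evaluators = load_evaluators(["singmos", "melody", "aesthetic"])
results = run_evaluation("outputs/yaoyin_hello.wav", evaluators)
for name, value in results.items():
    print(f"{name}: {value}")
```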
interface.py
CHANGED
@@ -1,3 +1,6 @@
+import time
+import uuid
+
 import gradio as gr
 import yaml
 
@@ -17,8 +20,8 @@ class GradioInterface:
         self.current_svs_model = (
             f"{self.default_config['language']}-{self.default_config['svs_model']}"
         )
-        self.
-            self.character_info[self.current_character].
+        self.current_voice = self.svs_model_map[self.current_svs_model]["voices"][
+            self.character_info[self.current_character].default_voice
         ]
         self.pipeline = SingingDialoguePipeline(self.default_config)
 
@@ -104,21 +107,21 @@
                         value=self.current_svs_model,
                     )
                 with gr.Row():
-
-                        label="Singing
+                    voice_radio = gr.Radio(
+                        label="Singing voice",
                         choices=list(
                             self.svs_model_map[self.current_svs_model][
-                                "
+                                "voices"
                             ].keys()
                         ),
                         value=self.character_info[
                             self.current_character
-                        ].
+                        ].default_voice,
                     )
                 character_radio.change(
                     fn=self.update_character,
                     inputs=character_radio,
-                    outputs=[character_image,
+                    outputs=[character_image, voice_radio],
                 )
                 asr_radio.change(
                     fn=self.update_asr_model, inputs=asr_radio, outputs=asr_radio
@@ -129,35 +132,41 @@
                 svs_radio.change(
                     fn=self.update_svs_model,
                     inputs=svs_radio,
-                    outputs=[svs_radio,
+                    outputs=[svs_radio, voice_radio],
                 )
                 melody_radio.change(
                     fn=self.update_melody_source,
                     inputs=melody_radio,
                     outputs=melody_radio,
                 )
-
-                    fn=self.
+                voice_radio.change(
+                    fn=self.update_voice, inputs=voice_radio, outputs=voice_radio
                 )
                 mic_input.change(
                     fn=self.run_pipeline,
                     inputs=mic_input,
                     outputs=[interaction_log, audio_output],
                 )
+                metrics_button.click(
+                    fn=self.update_metrics,
+                    inputs=audio_output,
+                    outputs=[metrics_output],
+                )
 
             return demo
         except Exception as e:
             print(f"error: {e}")
             breakpoint()
+            return gr.Blocks()
 
     def update_character(self, character):
         self.current_character = character
-
-        self.
-
+        character_voice = self.character_info[self.current_character].default_voice
+        self.current_voice = self.svs_model_map[self.current_svs_model]["voices"][
+            character_voice
         ]
         return gr.update(value=self.character_info[character].image_path), gr.update(
-            value=
+            value=character_voice
         )
 
     def update_asr_model(self, asr_model):
@@ -170,23 +179,23 @@
 
     def update_svs_model(self, svs_model):
         self.current_svs_model = svs_model
-
-        self.
-
+        character_voice = self.character_info[self.current_character].default_voice
+        self.current_voice = self.svs_model_map[self.current_svs_model]["voices"][
+            character_voice
         ]
         self.pipeline.set_svs_model(
             self.svs_model_map[self.current_svs_model]["model_path"]
         )
         print(
-            f"SVS model updated to {self.current_svs_model}. Will set gradio svs_radio to {svs_model} and
+            f"SVS model updated to {self.current_svs_model}. Will set gradio svs_radio to {svs_model} and voice_radio to {character_voice}"
         )
         return (
             gr.update(value=svs_model),
             gr.update(
                 choices=list(
-                    self.svs_model_map[self.current_svs_model]["
+                    self.svs_model_map[self.current_svs_model]["voices"].keys()
                 ),
-                value=
+                value=character_voice,
             ),
         )
 
@@ -194,24 +203,30 @@
         self.current_melody_source = melody_source
         return gr.update(value=self.current_melody_source)
 
-    def 
-        self.
-
-        ]
-        return gr.update(value=timbre)
+    def update_voice(self, voice):
+        self.current_voice = self.svs_model_map[self.current_svs_model]["voices"][voice]
+        return gr.update(value=voice)
 
     def run_pipeline(self, audio_path):
+        if not audio_path:
+            return gr.update(value=""), gr.update(value="")
+        tmp_file = f"audio_{int(time.time())}_{uuid.uuid4().hex[:8]}.wav"
         results = self.pipeline.run(
             audio_path,
            self.svs_model_map[self.current_svs_model]["lang"],
             self.character_info[self.current_character].prompt,
-
-
-
-            max_new_tokens=100,
+            self.current_voice,
+            output_audio_path=tmp_file,
+            max_new_tokens=50,
         )
         formatted_logs = f"ASR: {results['asr_text']}\nLLM: {results['llm_text']}"
-        return gr.update(value=formatted_logs), gr.update(
+        return gr.update(value=formatted_logs), gr.update(
+            value=results["output_audio_path"]
+        )
 
-    def 
-
+    def update_metrics(self, audio_path):
+        if not audio_path:
+            return gr.update(value="")
+        results = self.pipeline.evaluate(audio_path)
+        formatted_metrics = "\n".join([f"{k}: {v}" for k, v in results.items()])
+        return gr.update(value=formatted_metrics)
modules/asr.py
CHANGED
@@ -1,5 +1,4 @@
-
-
+import os
 from abc import ABC, abstractmethod
 
 import librosa
@@ -7,17 +6,17 @@ import numpy as np
 from transformers import pipeline
 
 ASR_MODEL_REGISTRY = {}
+hf_token = os.getenv("HF_TOKEN")
 
 
 class AbstractASRModel(ABC):
-    @abstractmethod
     def __init__(
         self, model_id: str, device: str = "cpu", cache_dir: str = "cache", **kwargs
     ):
+        print(f"Loading ASR model {model_id}...")
         self.model_id = model_id
         self.device = device
         self.cache_dir = cache_dir
-        pass
 
     @abstractmethod
     def transcribe(self, audio: np.ndarray, audio_sample_rate: int, **kwargs) -> str:
@@ -52,15 +51,11 @@ class WhisperASR(AbstractASRModel):
             "automatic-speech-recognition",
             model=model_id,
             device=0 if device == "cuda" else -1,
+            token=hf_token,
             **kwargs,
         )
 
     def transcribe(self, audio: np.ndarray, audio_sample_rate: int, language: str, **kwargs) -> str:
         if audio_sample_rate != 16000:
-
-
-            except Exception as e:
-                breakpoint()
-                print(f"Error resampling audio: {e}")
-        audio = librosa.resample(audio, orig_sr=audio_sample_rate, target_sr=16000)
-        return self.pipe(audio, generate_kwargs={"language": language}).get("text", "")
+            audio = librosa.resample(audio, orig_sr=audio_sample_rate, target_sr=16000)
+        return self.pipe(audio, generate_kwargs={"language": language}, return_timestamps=False).get("text", "")
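With the resampling branch fixed, standalone transcription is a short call. A hedged sketch, assuming `get_asr_model` (the factory imported by pipeline.py, not shown in this diff) accepts the same `device`/`cache_dir` keywords as `get_llm_model`:

```python
# Sketch: standalone ASR call against the module above (assumptions noted).
import librosa

from modules.asr import get_asr_model  # factory used by pipeline.py

audio, sr = librosa.load("tests/audio/hello.wav", sr=None)  # any sample rate
# Assumed signature: mirrors get_llm_model(model_id, device=..., **kwargs).
asr = get_asr_model("openai/whisper-large-v3-turbo", device="cpu", cache_dir=".cache")
text = asr.transcribe(audio, audio_sample_rate=sr, language="mandarin")
print(text)
```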
modules/llm/__init__.py
ADDED
@@ -0,0 +1,11 @@
from .base import AbstractLLMModel
from .registry import LLM_MODEL_REGISTRY, get_llm_model, register_llm_model
from .hf_pipeline import HFTextGenerationLLM
from .qwen import QwenLLM

__all__ = [
    "AbstractLLMModel",
    "get_llm_model",
    "register_llm_model",
    "LLM_MODEL_REGISTRY",
]
modules/llm/base.py
ADDED
@@ -0,0 +1,15 @@
from abc import ABC, abstractmethod


class AbstractLLMModel(ABC):
    def __init__(
        self, model_id: str, device: str = "cpu", cache_dir: str = "cache", **kwargs
    ):
        print(f"Loading LLM model {model_id}...")
        self.model_id = model_id
        self.device = device
        self.cache_dir = cache_dir

    @abstractmethod
    def generate(self, prompt: str, **kwargs) -> str:
        pass
modules/{llm.py → llm/hf_pipeline.py}
RENAMED
@@ -1,44 +1,21 @@
-
+import os
 
 from transformers import pipeline
 
-
+from .base import AbstractLLMModel
+from .registry import register_llm_model
 
-
-class AbstractLLMModel(ABC):
-    @abstractmethod
-    def __init__(
-        self, model_id: str, device: str = "cpu", cache_dir: str = "cache", **kwargs
-    ): ...
-
-    @abstractmethod
-    def generate(self, prompt: str, **kwargs) -> str:
-        pass
-
-
-def register_llm_model(prefix: str):
-    def wrapper(cls):
-        assert issubclass(cls, AbstractLLMModel), f"{cls} must inherit AbstractLLMModel"
-        LLM_MODEL_REGISTRY[prefix] = cls
-        return cls
-
-    return wrapper
-
-
-def get_llm_model(model_id: str, device="cpu", **kwargs) -> AbstractLLMModel:
-    for prefix, cls in LLM_MODEL_REGISTRY.items():
-        if model_id.startswith(prefix):
-            return cls(model_id, device=device, **kwargs)
-    raise ValueError(f"No LLM wrapper found for model: {model_id}")
+hf_token = os.getenv("HF_TOKEN")
 
 
-@register_llm_model("
-@register_llm_model("
-@register_llm_model("meta-llama")
+@register_llm_model("openai-community/")
+@register_llm_model("google/gemma-")
+@register_llm_model("meta-llama/Llama-")
 class HFTextGenerationLLM(AbstractLLMModel):
     def __init__(
         self, model_id: str, device: str = "cpu", cache_dir: str = "cache", **kwargs
     ):
+        super().__init__(model_id, device, cache_dir, **kwargs)
         model_kwargs = kwargs.setdefault("model_kwargs", {})
         model_kwargs["cache_dir"] = cache_dir
         self.pipe = pipeline(
@@ -46,6 +23,8 @@ class HFTextGenerationLLM(AbstractLLMModel):
             model=model_id,
             device=0 if device == "cuda" else -1,
             return_full_text=False,
+            token=hf_token,
+            trust_remote_code=True,
             **kwargs,
         )
 
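The prefix decorators above mean any model id beginning with `openai-community/`, `google/gemma-`, or `meta-llama/Llama-` resolves to this wrapper. Usage matches tests/test_llm_infer.py at the end of this commit:

```python
# Sketch: text generation through the registry (same calls as the test below).
from modules.llm import get_llm_model

llm = get_llm_model("google/gemma-2-2b", cache_dir="./.cache")
reply = llm.generate("你好,今天你心情怎么样?")
print(reply)
```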
modules/llm/registry.py
ADDED
@@ -0,0 +1,19 @@
from .base import AbstractLLMModel

LLM_MODEL_REGISTRY = {}


def register_llm_model(prefix: str):
    def wrapper(cls):
        assert issubclass(cls, AbstractLLMModel), f"{cls} must inherit AbstractLLMModel"
        LLM_MODEL_REGISTRY[prefix] = cls
        return cls

    return wrapper


def get_llm_model(model_id: str, device="cpu", **kwargs) -> AbstractLLMModel:
    for prefix, cls in LLM_MODEL_REGISTRY.items():
        if model_id.startswith(prefix):
            return cls(model_id, device=device, **kwargs)
    raise ValueError(f"No LLM wrapper found for model: {model_id}")
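Splitting the registry into its own module makes it straightforward to plug in further backends. A hypothetical sketch (the `EchoLLM` class and the `example-org/` prefix are made up purely for illustration):

```python
# Hypothetical example: registering an extra backend with the registry above.
from modules.llm import AbstractLLMModel, get_llm_model, register_llm_model


@register_llm_model("example-org/")  # any model id starting with this prefix
class EchoLLM(AbstractLLMModel):
    """Toy backend that just echoes the prompt (illustration only)."""

    def generate(self, prompt: str, **kwargs) -> str:
        return f"[{self.model_id}] {prompt}"


llm = get_llm_model("example-org/echo-1b")
print(llm.generate("hello"))
```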
modules/melody.py
CHANGED
@@ -109,9 +109,10 @@ class MelodyController:
             if pitch == 0:
                 score.append((st, ed, ref_lyric, pitch))
             elif ref_lyric in ["-", "——"] and align_type == "lyric":
-                score.append((st, ed,
-                text_idx += 1
+                score.append((st, ed, "-", pitch))
             else:
                 score.append((st, ed, text_list[text_idx], pitch))
                 text_idx += 1
+                if text_idx >= len(text_list):
+                    break
         return score
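The added bounds check guards against replies shorter than the reference melody: once the lyric characters run out, the score is cut instead of indexing past the end of `text_list`. A tiny worked example of the behaviour this protects, simplified from the loop above (values are illustrative, the `align_type` branch is omitted):

```python
# Illustrative only: mimics the alignment loop's handling of a short lyric.
reference = [  # (start_sec, end_sec, ref_lyric, midi_pitch) from the melody source
    (0.0, 0.5, "你", 60),
    (0.5, 1.0, "-", 60),   # "-" continues the previous syllable
    (1.0, 1.5, "好", 62),
    (1.5, 2.0, "吗", 64),
]
text_list = ["早", "安"]   # LLM reply shorter than the melody

score, text_idx = [], 0
for st, ed, ref_lyric, pitch in reference:
    if pitch == 0:
        score.append((st, ed, ref_lyric, pitch))
    elif ref_lyric in ["-", "——"]:
        score.append((st, ed, "-", pitch))
    else:
        score.append((st, ed, text_list[text_idx], pitch))
        text_idx += 1
        if text_idx >= len(text_list):
            break  # same early exit as the fix above

print(score)  # [(0.0, 0.5, '早', 60), (0.5, 1.0, '-', 60), (1.0, 1.5, '安', 62)]
```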
modules/svs/base.py
CHANGED
@@ -13,6 +13,8 @@ class AbstractSVSModel(ABC):
     def synthesize(
         self,
         score: list[tuple[float, float, str, int]],
+        language: str,
+        speaker: str,
         **kwargs,
     ) -> tuple[np.ndarray, int]:
         """
modules/svs/espnet.py
CHANGED
@@ -53,7 +53,7 @@ class ESPNetSVS(AbstractSVSModel):
         phoneme_mappers = {}
         return phoneme_mappers
 
-    def _preprocess(self, score: list[tuple[float, float, str, int]], language: str):
+    def _preprocess(self, score: list[tuple[float, float, str, int] | tuple[float, float, str, float]], language: str):
         if language not in self.phoneme_mappers:
             raise ValueError(f"Unsupported language: {language} for {self.model_id}")
         phoneme_mapper = self.phoneme_mappers[language]
@@ -90,20 +90,20 @@ class ESPNetSVS(AbstractSVSModel):
             pre_phn = phn_units[-1]
 
         batch = {
-            "score":
-
-
-
+            "score": (
+                120,  # does not affect svs result, as note durations are in time unit
+                notes,
+            ),
             "text": " ".join(phns),
         }
         return batch
 
     def synthesize(
-        self, score: list[tuple[float, float, str, int]], language: str, **kwargs
+        self, score: list[tuple[float, float, str, float] | tuple[float, float, str, int]], language: str, speaker: str, **kwargs
     ):
         batch = self._preprocess(score, language)
         if self.model_id == "espnet/aceopencpop_svs_visinger2_40singer_pretrain":
-            sid = np.array([int(
+            sid = np.array([int(speaker)])
             output_dict = self.model(batch, sids=sid)
         elif self.model_id == "espnet/mixdata_svs_visinger2_spkemb_lang_pretrained":
             langs = {
@@ -115,7 +115,7 @@ class ESPNetSVS(AbstractSVSModel):
                     f"Unsupported language: {language} for {self.model_id}"
                 )
             lid = np.array([langs[language]])
-            spk_embed = np.load(
+            spk_embed = np.load(speaker)
             output_dict = self.model(batch, lids=lid, spembs=spk_embed)
         else:
             raise NotImplementedError(f"Model {self.model_id} not supported")
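With `speaker` now an explicit argument, both checkpoints are driven through the same `synthesize()` call, only the meaning of `speaker` changes. A hedged sketch; it assumes an already constructed `ESPNetSVS` instance (how the model is built is outside this diff), and the score values are made up:

```python
# Sketch: driving either checkpoint through the unified synthesize() signature.
import soundfile as sf


def sing(svs, speaker):
    # (start_sec, end_sec, lyric, midi_pitch) tuples; values illustrative only.
    score = [(0.0, 0.5, "你", 60), (0.5, 1.0, "好", 62)]
    audio, sr = svs.synthesize(score, language="mandarin", speaker=speaker)
    sf.write("outputs/svs_demo.wav", audio, sr)


# Chinese 40-singer checkpoint: speaker is an integer singer id (as a string).
# sing(svs, "5")
# Bilingual checkpoint: speaker is a path to a singer-embedding .npy file.
# sing(svs, "resource/singer/singer_embedding_ace-2.npy")
```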
modules/utils/g2p.py
CHANGED
@@ -32,6 +32,7 @@ for plan in ace_phonemes_all_plans["plans"]:
 
 
 def preprocess_text(text: str, language: str) -> list[str]:
+    text = text.replace(" ", "")
     if language == "mandarin":
         text_list = to_pinyin(text)
     elif language == "japanese":
modules/utils/text_normalize.py
CHANGED
@@ -3,12 +3,13 @@ from typing import Optional
 
 
 def remove_non_zh_jp(text: str) -> str:
-    pattern = r"[^\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\u3000-\u303f\uff01-\uffef]"
+    pattern = r"[^\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\u3000-\u303f\uff01-\uffef\s]"
     return re.sub(pattern, "", text)
 
 
 def truncate_sentences(text: str, max_sentences: int) -> str:
-    sentences = re.split(r"(?<=[
+    sentences = re.split(r"(?<=[。!?!?~])|(?:\n+)|(?: {2,})", text)
+    sentences = [s.strip() for s in sentences if s.strip()]
     return "".join(sentences[:max_sentences]).strip()
 
 
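The widened split pattern now breaks on CJK sentence punctuation as well as newlines and runs of double spaces, which is what `max_sentences: 1` in yaoyin_test.yaml relies on. A quick self-contained check of the new regex:

```python
# Sketch: effect of the new sentence splitting on LLM-output truncation.
import re


def truncate_sentences(text: str, max_sentences: int) -> str:
    sentences = re.split(r"(?<=[。!?!?~])|(?:\n+)|(?: {2,})", text)
    sentences = [s.strip() for s in sentences if s.strip()]
    return "".join(sentences[:max_sentences]).strip()


print(truncate_sentences("你好!很高兴见到你。今天想聊什么?", max_sentences=1))
# -> "你好!"
```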
pipeline.py
CHANGED
@@ -1,6 +1,11 @@
-import 
+from __future__ import annotations
+
 import time
+from pathlib import Path
+
 import librosa
+import soundfile as sf
+import torch
 
 from modules.asr import get_asr_model
 from modules.llm import get_llm_model
@@ -29,6 +34,7 @@ class SingingDialoguePipeline:
         self.melody_controller = MelodyController(
             config["melody_source"], self.cache_dir
         )
+        self.max_sentences = config.get("max_sentences", 2)
         self.track_latency = config.get("track_latency", False)
         self.evaluators = load_evaluators(config.get("evaluators", {}).get("svs", []))
 
@@ -55,8 +61,9 @@ class SingingDialoguePipeline:
         audio_path,
         language,
         prompt_template,
-
-
+        speaker,
+        output_audio_path: Path | str = None,
+        max_new_tokens=50,
     ):
         if self.track_latency:
             asr_start_time = time.time()
@@ -75,13 +82,14 @@ class SingingDialoguePipeline:
         if self.track_latency:
             llm_end_time = time.time()
             llm_latency = llm_end_time - llm_start_time
-
-
+        llm_response = clean_llm_output(
+            output, language=language, max_sentences=self.max_sentences
+        )
         score = self.melody_controller.generate_score(llm_response, language)
         if self.track_latency:
             svs_start_time = time.time()
         singing_audio, sample_rate = self.svs.synthesize(
-            score, language=language,
+            score, language=language, speaker=speaker
        )
         if self.track_latency:
             svs_end_time = time.time()
@@ -89,15 +97,19 @@ class SingingDialoguePipeline:
         results = {
             "asr_text": asr_result,
             "llm_text": llm_response,
-            "svs_audio": (
+            "svs_audio": (sample_rate, singing_audio),
         }
+        if output_audio_path:
+            Path(output_audio_path).parent.mkdir(parents=True, exist_ok=True)
+            sf.write(output_audio_path, singing_audio, sample_rate)
+            results["output_audio_path"] = output_audio_path
         if self.track_latency:
-            results["metrics"]
+            results["metrics"] = {
                 "asr_latency": asr_latency,
                 "llm_latency": llm_latency,
                 "svs_latency": svs_latency,
-            }
+            }
         return results
 
-    def evaluate(self,
-        return run_evaluation(
+    def evaluate(self, audio_path):
+        return run_evaluation(audio_path, self.evaluators)
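With `track_latency` enabled (as in the CLI configs above), the dict returned by `run()` carries per-stage timings alongside the audio, and `evaluate()` reuses the evaluators loaded at construction. A short sketch of consuming a result, assuming a config shaped like config/cli/yaoyin_default.yaml:

```python
# Sketch: per-stage latency + offline metrics from a single run.
import yaml

from characters import CHARACTERS
from pipeline import SingingDialoguePipeline

with open("config/cli/yaoyin_default.yaml") as f:
    config = yaml.safe_load(f)

pipeline = SingingDialoguePipeline(config)
results = pipeline.run(
    "tests/audio/hello.wav",
    config["language"],
    CHARACTERS[config["prompt_template_character"]].prompt,
    config["speaker"],
    output_audio_path="outputs/reply.wav",
)

print(results["metrics"])  # asr/llm/svs latency in seconds (track_latency: True)
print(pipeline.evaluate(results["output_audio_path"]))  # singmos/melody/... scores
```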
requirements.txt
CHANGED
@@ -1,6 +1,6 @@
 git+https://github.com/espnet/espnet
 espnet_model_zoo
-
+pyopenjtalk
 datasets
 torchaudio
 typeguard==4.4.0
@@ -15,3 +15,6 @@ transformers
 s3prl
 zhconv
 git+https://github.com/sea-turt1e/kanjiconv
+soundfile
+PyYAML
+gradio
tests/__init__.py
ADDED
File without changes
tests/audio/chat.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:181a7f27f8acb00cba0276d0ff88759120a76eebd47b4e0a60c2424e43e5cbaf
size 271030
tests/audio/feeling.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0fef036c2bf0ddf635a004845e94c89d0658f754a53e12fadbb50511d3cd6c15
size 263502
tests/audio/hello.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:aa7e839d32f7bda77cad11fc13fd1b92df939479612dd5af079d8f9b19598c0d
size 263502
tests/audio/interesting.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5a1618f73d90ad068d5eb72455ac812b49fcb9e44e88af5e67ef88f5c6ddb74a
size 429086
tests/audio/music.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6388b587e282e8f6457b629b5cbb9fd50c5cb6a7f90c446329a3f23be8b1442c
size 286082
tests/audio/where_from.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7ef81772b96813216d7b14d3d70a39b040e9c542d896d9337f8975f8fd6da96e
size 195766
tests/test_llm_infer.py
ADDED
@@ -0,0 +1,26 @@
from modules.llm import get_llm_model

if __name__ == "__main__":
    supported_llms = [
        # "MiniMaxAI/MiniMax-M1-80k",  # -> load with custom code
        # "Qwen/Qwen-1_8B",
        # "meta-llama/Llama-3.1-8B-Instruct",  # pending for approval
        # "tiiuae/Falcon-H1-1B-Base",
        # "tiiuae/Falcon-H1-3B-Instruct",
        # "tencent/Hunyuan-A13B-Instruct",  # -> load with custom code
        # "deepseek-ai/DeepSeek-R1-0528",
        # "openai-community/gpt2-xl",
        # "google/gemma-2-2b",
    ]
    for model_id in supported_llms:
        try:
            print(f"Loading model: {model_id}")
            llm = get_llm_model(model_id, cache_dir="./.cache")
            prompt = "你好,今天你心情怎么样?"
            result = llm.generate(prompt)
            print(f"=================")
            print(f"[{model_id}] LLM inference result:", result)
        except Exception as e:
            print(f"Failed to load model {model_id}: {e}")
            breakpoint()
            continue