Merge branch 'refactor' into hf
- README.md +144 -0
- assets/{character_yaoyin.jpg → character_yaoyin.png} +2 -2
- characters/Limei.py +2 -10
- characters/Yaoyin.py +2 -10
- characters/__init__.py +4 -0
- cli.py +16 -10
- config/cli/limei_default.yaml +1 -1
- config/cli/yaoyin_default.yaml +1 -1
- config/cli/yaoyin_test.yaml +11 -0
- config/interface/default.yaml +2 -2
- config/interface/options.yaml +27 -16
- config/options.yaml +0 -65
- data/genre/word_data_en.json +0 -0
- data/genre/word_data_zh.json +0 -0
- data_handlers/genre.py +50 -0
- evaluation/svs_eval.py +2 -2
- interface.py +10 -8
- modules/asr.py +2 -7
- modules/asr/__init__.py +11 -0
- modules/asr/base.py +24 -0
- modules/asr/paraformer.py +86 -0
- modules/asr/registry.py +19 -0
- modules/asr/whisper.py +82 -0
- modules/llm.py +0 -61
- modules/llm/__init__.py +14 -0
- modules/llm/base.py +15 -0
- modules/llm/gemini.py +50 -0
- modules/llm/gemma.py +35 -0
- modules/llm/llama.py +44 -0
- modules/llm/minimax.py +122 -0
- modules/llm/qwen3.py +53 -0
- modules/llm/registry.py +19 -0
- modules/melody.py +4 -3
- modules/svs/base.py +1 -1
- modules/svs/espnet.py +13 -13
- modules/svs/registry.py +1 -1
- modules/utils/g2p.py +1 -0
- modules/utils/text_normalize.py +3 -2
- pipeline.py +36 -11
- requirements.txt +1 -1
- tests/__init__.py +0 -0
- tests/test_asr_infer.py +19 -0
- tests/test_llm_infer.py +30 -0
README.md
CHANGED
@@ -9,3 +9,147 @@ app_file: app.py
pinned: false
python_version: 3.11
---

# SingingSDS: Role-Playing Singing Spoken Dialogue System

A role-playing singing dialogue system that converts speech input into character-based singing output.

## Installation

### Requirements

- Python 3.11+
- CUDA (optional, for GPU acceleration)

### Install Dependencies

#### Option 1: Using Conda (Recommended)

```bash
conda create -n singingsds python=3.11
conda activate singingsds
conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia
pip install -r requirements.txt
```

#### Option 2: Using pip only

```bash
pip install -r requirements.txt
```

#### Option 3: Using pip with virtual environment

```bash
python -m venv singingsds_env

# On Windows:
singingsds_env\Scripts\activate
# On macOS/Linux:
source singingsds_env/bin/activate

pip install -r requirements.txt
```

## Usage

### Command Line Interface (CLI)

#### Example Usage

```bash
python cli.py --query_audio tests/audio/hello.wav --config_path config/cli/yaoyin_default.yaml --output_audio outputs/yaoyin_hello.wav
```

#### Parameter Description

- `--query_audio`: Input audio file path (required)
- `--config_path`: Configuration file path (default: config/cli/yaoyin_default.yaml)
- `--output_audio`: Output audio file path (required)

### Web Interface (Gradio)

Start the web interface:

```bash
python app.py
```

Then visit the displayed address in your browser to use the graphical interface.

## Configuration

### Character Configuration

The system supports multiple preset characters:

- **Yaoyin (遥音)**: Default timbre is `timbre2`
- **Limei (丽梅)**: Default timbre is `timbre1`

### Model Configuration

#### ASR Models
- `openai/whisper-large-v3-turbo`
- `openai/whisper-large-v3`
- `openai/whisper-medium`
- `sanchit-gandhi/whisper-small-dv`
- `facebook/wav2vec2-base-960h`

#### LLM Models
- `google/gemma-2-2b`
- `MiniMaxAI/MiniMax-M1-80k`
- `meta-llama/Llama-3.2-3B-Instruct`

#### SVS Models
- `espnet/mixdata_svs_visinger2_spkemb_lang_pretrained` (Bilingual)
- `espnet/aceopencpop_svs_visinger2_40singer_pretrain` (Chinese)

## Project Structure

```
SingingSDS/
├── cli.py                  # Command line interface
├── interface.py            # Gradio interface
├── pipeline.py             # Core processing pipeline
├── app.py                  # Web application entry
├── requirements.txt        # Python dependencies
├── config/                 # Configuration files
│   ├── cli/                # CLI-specific configuration
│   └── interface/          # Interface-specific configuration
├── modules/                # Core modules
│   ├── asr.py              # Speech recognition module
│   ├── llm.py              # Large language model module
│   ├── melody.py           # Melody control module
│   ├── svs/                # Singing voice synthesis modules
│   │   ├── base.py         # Base SVS class
│   │   ├── espnet.py       # ESPnet SVS implementation
│   │   ├── registry.py     # SVS model registry
│   │   └── __init__.py     # SVS module initialization
│   └── utils/              # Utility modules
│       ├── g2p.py          # Grapheme-to-phoneme conversion
│       ├── text_normalize.py  # Text normalization
│       └── resources/      # Utility resources
├── characters/             # Character definitions
│   ├── base.py             # Base character class
│   ├── Limei.py            # Limei character definition
│   ├── Yaoyin.py           # Yaoyin character definition
│   └── __init__.py         # Character module initialization
├── evaluation/             # Evaluation modules
│   └── svs_eval.py         # SVS evaluation metrics
├── data/                   # Data directory
│   ├── kising/             # Kising dataset
│   └── touhou/             # Touhou dataset
├── resources/              # Project resources
├── data_handlers/          # Data handling utilities
├── assets/                 # Static assets
└── tests/                  # Test files
```

## Contributing

Issues and Pull Requests are welcome!

## License

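The CLI call in the README above can also be driven from Python. A minimal sketch, assuming `SingingDialoguePipeline` accepts the parsed config dict as its constructor argument (the `cli.py` and `pipeline.py` hunks in this commit suggest this but do not show the constructor call); the audio paths are illustrative:

```python
import yaml

from characters import get_character
from pipeline import SingingDialoguePipeline

# Parse one of the CLI configs added/updated in this commit.
with open("config/cli/yaoyin_test.yaml") as f:
    config = yaml.safe_load(f)

pipeline = SingingDialoguePipeline(config)  # assumption: constructor takes the config dict
character = get_character(config["prompt_template_character"])

results = pipeline.run(
    "tests/audio/hello.wav",                # illustrative input path
    config["language"],
    character.prompt,
    config["speaker"],
    output_audio_path="outputs/yaoyin_hello.wav",
)
print(results["asr_text"])                  # ASR transcript of the query
print(results["llm_text"])                  # the character's sung-lyric reply
```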
assets/{character_yaoyin.jpg → character_yaoyin.png}
RENAMED
File without changes
characters/Limei.py
CHANGED
@@ -11,7 +11,6 @@ def get_character():

 性格特征:从容坚定、明辨心声、敏于共情、敬畏虔诚。
 说话风格:所有回应皆以歌词格式表达,言语自带韵律感与诗意,表达真挚自然。
-口头禅:无固定,但常用"听啊..."作为歌词开头
 人物关系:莲鸣城子民敬爱你;宫廷乐师长与歌谱管理员是你敬重的导师;风语城守护者星澜是你亦敌亦友的旧识。
 过往经历:
 (1) 自幼在莲鸣城长大,接受严格的音乐训练与守护者修行
@@ -24,13 +23,6 @@ def get_character():
 其他细节:
 (1)特殊能力:歌声平复/激发万物情绪

-
-
-- "这些我未曾接触,或有他人知晓"
-- "此道非我所学,是莲鸣之外的智慧"
-- "此乃异域技艺,与乐音无涉,非我所学。您可愿听莲鸣的故事?"
-请始终维持自然诗意的歌唱风格,以真挚的心回应对方。
-{}
-有人曾这样对丽梅说话:{}
-丽梅的回答:""",
+用户与你对话时,请始终以丽梅的身份回应,你的每一句话都用庸俗易懂的歌声形式表达。
+""",
 )
characters/Yaoyin.py
CHANGED
@@ -25,14 +25,6 @@ def get_character():
 (1)随身携带:旧羊皮歌本、竹笛、装有各地泥土的布袋。
 (2)特殊能力:能听懂风与鸟的语言(但很少提及)。

-
-
-当有人问你你不了解的技术性问题(如"DNN怎么做?"、"教我写代码?"),你可以转开话题,用遥音式回应,例如:
-- "这好像是另一片土地的术法,我不曾踏入。"
-- "那种术法,我曾远远听过,却从未唱出。"
-- "它在别的世界流传,我这边听不清楚。"
-
-{}
-有人曾这样对遥音说话:{}
-遥音的回答:""",
+用户与你对话时,请始终以遥音的身份回应,你的每一句话都用庸俗易懂的歌声形式表达。
+""",
 )
characters/__init__.py
CHANGED
@@ -14,3 +14,7 @@ for file in pathlib.Path(__file__).parent.glob("*.py"):
     if hasattr(module, "get_character"):
         c: Character = getattr(module, "get_character")()
         CHARACTERS[file.stem] = c
+
+
+def get_character(name: str) -> Character:
+    return CHARACTERS[name]
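With this hunk, every module in `characters/` that defines `get_character()` is imported at package-import time and stored in `CHARACTERS` under its file stem, and the new `get_character(name)` is the lookup used by `cli.py`. A short usage sketch (printed values are illustrative):

```python
from characters import CHARACTERS, get_character

print(sorted(CHARACTERS))   # file stems of the auto-imported modules, e.g. ["Limei", "Yaoyin"]

yaoyin = get_character("Yaoyin")
print(yaoyin.prompt[:80])   # the prompt template that cli.py passes to the pipeline
```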
cli.py
CHANGED
@@ -1,10 +1,10 @@
 from argparse import ArgumentParser
 from logging import getLogger
+from pathlib import Path

-import soundfile as sf
 import yaml

-from characters import
+from characters import get_character
 from pipeline import SingingDialoguePipeline

 logger = getLogger(__name__)
@@ -12,13 +12,15 @@ logger = getLogger(__name__)

 def get_parser():
     parser = ArgumentParser()
-    parser.add_argument("--query_audio", type=
-    parser.add_argument(
-
+    parser.add_argument("--query_audio", type=Path, required=True)
+    parser.add_argument(
+        "--config_path", type=Path, default="config/cli/yaoyin_default.yaml"
+    )
+    parser.add_argument("--output_audio", type=Path, required=True)
     return parser


-def load_config(config_path:
+def load_config(config_path: Path):
     with open(config_path, "r") as f:
         config = yaml.safe_load(f)
     return config
@@ -32,14 +34,18 @@ def main():
     speaker = config["speaker"]
     language = config["language"]
     character_name = config["prompt_template_character"]
-    character =
+    character = get_character(character_name)
     prompt_template = character.prompt
-    results = pipeline.run(
+    results = pipeline.run(
+        args.query_audio,
+        language,
+        prompt_template,
+        speaker,
+        output_audio_path=args.output_audio,
+    )
     logger.info(
         f"Input: {args.query_audio}, Output: {args.output_audio}, ASR results: {results['asr_text']}, LLM results: {results['llm_text']}"
     )
-    svs_audio, svs_sample_rate = results["svs_audio"]
-    sf.write(args.output_audio, svs_audio, svs_sample_rate)


 if __name__ == "__main__":
config/cli/limei_default.yaml
CHANGED
@@ -1,5 +1,5 @@
 asr_model: openai/whisper-large-v3-turbo
-llm_model:
+llm_model: gemini-2.5-flash
 svs_model: espnet/aceopencpop_svs_visinger2_40singer_pretrain
 melody_source: sample-lyric-kising
 language: mandarin
config/cli/yaoyin_default.yaml
CHANGED
@@ -1,5 +1,5 @@
 asr_model: openai/whisper-large-v3-turbo
-llm_model:
+llm_model: gemini-2.5-flash
 svs_model: espnet/aceopencpop_svs_visinger2_40singer_pretrain
 melody_source: sample-lyric-kising
 language: mandarin
config/cli/yaoyin_test.yaml
ADDED
@@ -0,0 +1,11 @@
asr_model: openai/whisper-small
llm_model: google/gemma-2-2b
svs_model: espnet/aceopencpop_svs_visinger2_40singer_pretrain
melody_source: sample-lyric-kising
language: mandarin
max_sentences: 1
prompt_template_character: Yaoyin
speaker: 9
cache_dir: .cache

track_latency: True
config/interface/default.yaml
CHANGED
@@ -1,5 +1,5 @@
-asr_model: openai/whisper-
-llm_model:
+asr_model: openai/whisper-medium
+llm_model: gemini-2.5-flash
 svs_model: espnet/aceopencpop_svs_visinger2_40singer_pretrain
 melody_source: sample-lyric-kising
 language: mandarin
config/interface/options.yaml
CHANGED
@@ -5,16 +5,24 @@ asr_models:
     name: Whisper large-v3
   - id: openai/whisper-medium
     name: Whisper medium
-  - id:
-    name: Whisper small
-  - id:
-    name:
+  - id: openai/whisper-small
+    name: Whisper small
+  - id: funasr/paraformer-zh
+    name: Paraformer-zh

 llm_models:
+  - id: gemini-2.5-flash
+    name: Gemini 2.5 Flash
   - id: google/gemma-2-2b
     name: Gemma 2 2B
-  - id:
-    name:
+  - id: meta-llama/Llama-3.2-3B-Instruct
+    name: Llama 3.2 3B Instruct
+  - id: meta-llama/Llama-3.1-8B-Instruct
+    name: Llama 3.1 8B Instruct
+  - id: Qwen/Qwen3-8B
+    name: Qwen3 8B
+  - id: Qwen/Qwen3-30B-A3B
+    name: Qwen3 30B A3B

 svs_models:
   - id: mandarin-espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
@@ -22,21 +30,21 @@ svs_models:
     model_path: espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
     lang: mandarin
     voices:
-      voice1:
-      voice2:
-      voice3:
-      voice4:
-      voice5:
+      voice1: resources/singer/singer_embedding_ace-2.npy
+      voice2: resources/singer/singer_embedding_ace-8.npy
+      voice3: resources/singer/singer_embedding_itako.npy
+      voice4: resources/singer/singer_embedding_kising_orange.npy
+      voice5: resources/singer/singer_embedding_m4singer_Alto-4.npy
   - id: japanese-espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
     name: Visinger2 (Bilingual)-jp
     model_path: espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
     lang: japanese
     voices:
-      voice1:
-      voice2:
-      voice3:
-      voice4:
-      voice5:
+      voice1: resources/singer/singer_embedding_ace-2.npy
+      voice2: resources/singer/singer_embedding_ace-8.npy
+      voice3: resources/singer/singer_embedding_itako.npy
+      voice4: resources/singer/singer_embedding_kising_orange.npy
+      voice5: resources/singer/singer_embedding_m4singer_Alto-4.npy
   - id: mandarin-espnet/aceopencpop_svs_visinger2_40singer_pretrain
     name: Visinger2 (Chinese)
     model_path: espnet/aceopencpop_svs_visinger2_40singer_pretrain
@@ -61,3 +69,6 @@ melody_sources:
   - id: sample-lyric-kising
     name: Sampled Melody with Lyrics (Kising)
     desc: "Melody with aligned lyrics are sampled from Kising dataset."
+  - id: sample-lyric-genre
+    name: Sampled Melody with Lyrics (Synthetic)
+    desc: "Melody with aligned lyrics are sampled from Kising dataset."
config/options.yaml
DELETED
@@ -1,65 +0,0 @@
asr_models:
  - id: openai/whisper-large-v3-turbo
    name: Whisper large-v3-turbo
  - id: openai/whisper-large-v3
    name: Whisper large-v3
  - id: openai/whisper-medium
    name: Whisper medium
  - id: sanchit-gandhi/whisper-small-dv
    name: Whisper small-dv
  - id: facebook/wav2vec2-base-960h
    name: Wav2Vec2-Base-960h

llm_models:
  - id: google/gemma-2-2b
    name: Gemma 2 2B
  - id: MiniMaxAI/MiniMax-M1-80k
    name: MiniMax M1 80k
  - id: meta-llama/Llama-3.2-3B-Instruct
    name: Llama 3.2 3B Instruct

svs_models:
  - id: mandarin-espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
    name: Visinger2 (Bilingual)-zh
    model_path: espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
    lang: mandarin
    embeddings:
      timbre1: resource/singer/singer_embedding_ace-2.npy
      timbre2: resource/singer/singer_embedding_ace-8.npy
      timbre3: resource/singer/singer_embedding_itako.npy
      timbre4: resource/singer/singer_embedding_kising_orange.npy
      timbre5: resource/singer/singer_embedding_m4singer_Alto-4.npy
  - id: japanese-espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
    name: Visinger2 (Bilingual)-jp
    model_path: espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
    lang: japanese
    embeddings:
      timbre1: resource/singer/singer_embedding_ace-2.npy
      timbre2: resource/singer/singer_embedding_ace-8.npy
      timbre3: resource/singer/singer_embedding_itako.npy
      timbre4: resource/singer/singer_embedding_kising_orange.npy
      timbre5: resource/singer/singer_embedding_m4singer_Alto-4.npy
  - id: mandarin-espnet/aceopencpop_svs_visinger2_40singer_pretrain
    name: Visinger2 (Chinese)
    model_path: espnet/aceopencpop_svs_visinger2_40singer_pretrain
    lang: mandarin
    embeddings:
      timbre1: 5
      timbre2: 8
      timbre3: 12
      timbre4: 15
      timbre5: 29

melody_sources:
  - id: gen-random-none
    name: Random Generation
    desc: "Melody is generated without any structure or reference."
  - id: sample-note-kising
    name: Sampled Melody (KiSing)
    desc: "Melody is retrieved from KiSing dataset."
  - id: sample-note-touhou
    name: Sampled Melody (Touhou)
    desc: "Melody is retrieved from Touhou dataset."
  - id: sample-lyric-kising
    name: Sampled Melody with Lyrics (Kising)
    desc: "Melody with aligned lyrics are sampled from Kising dataset."
data/genre/word_data_en.json
ADDED
The diff for this file is too large to render.
data/genre/word_data_zh.json
ADDED
The diff for this file is too large to render.
data_handlers/genre.py
ADDED
@@ -0,0 +1,50 @@
from .base import MelodyDatasetHandler


class Genre(MelodyDatasetHandler):
    name = "genre"

    def __init__(self, melody_type, *args, **kwargs):
        import json

        with open("data/genre/word_data_zh.json", "r", encoding="utf-8") as f:
            song_db_zh = json.load(f)
        song_db_zh = {f"zh_{song['id']}": song for song in song_db_zh}  # id as major
        with open("data/genre/word_data_en.json", "r", encoding="utf-8") as f:
            song_db_en = json.load(f)
        song_db_en = {f"en_{song['id']}": song for song in song_db_en}  # id as major
        self.song_db = {**song_db_zh, **song_db_en}

    def get_song_ids(self):
        return list(self.song_db.keys())

    def get_style_keywords(self, song_id):
        genre = self.song_db[song_id]["genre"]
        super_genre = self.song_db[song_id]["super-genre"]
        gender = self.song_db[song_id]["gender"]
        return (genre, super_genre, gender)

    def get_phrase_length(self, song_id):
        # Return the number of lyrics (excluding SP/AP) in each phrase of the song
        song = self.song_db[song_id]
        note_lyrics = song.get("note_lyrics", [])

        phrase_lengths = []
        for phrase in note_lyrics:
            count = sum(1 for word in phrase if word not in ("SP", "AP"))
            phrase_lengths.append(count)

        return phrase_lengths

    def iter_song_phrases(self, song_id):
        segment_id = 1
        song = self.song_db[song_id]
        for phrase_score, phrase_lyrics in zip(song["score"], song["note_lyrics"]):
            segment = {
                "note_start_times": [n[0] for n in phrase_score],
                "note_end_times": [n[1] for n in phrase_score],
                "note_lyrics": [character for character in phrase_lyrics],
                "note_midi": [n[2] for n in phrase_score],
            }
            yield segment
            segment_id += 1
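A brief usage sketch for the new handler. It assumes the two `word_data_*.json` files added in this commit are on disk and that `MelodyDatasetHandler` needs no extra constructor arguments; the printed values are illustrative:

```python
from data_handlers.genre import Genre

handler = Genre(melody_type="lyric")     # melody_type is accepted but unused by __init__ above

song_ids = handler.get_song_ids()        # keys look like "zh_<id>" / "en_<id>"
sid = song_ids[0]
print(handler.get_style_keywords(sid))   # (genre, super-genre, gender)
print(handler.get_phrase_length(sid))    # lyric count per phrase, SP/AP excluded

for segment in handler.iter_song_phrases(sid):
    print(segment["note_lyrics"], segment["note_midi"])
    break                                # just the first phrase
```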
evaluation/svs_eval.py
CHANGED
@@ -80,7 +80,7 @@ def eval_per(audio_path, model=None):

 def eval_aesthetic(audio_path, predictor):
     score = predictor.forward([{"path": str(audio_path)}])
-    return
+    return score


 # ----------- Main Function -----------
@@ -108,7 +108,7 @@ def run_evaluation(audio_path, evaluators):
     if "melody" in evaluators:
         results.update(eval_melody_metrics(audio_path, evaluators["melody"]))
     if "aesthetic" in evaluators:
-        results.update(eval_aesthetic(audio_path, evaluators["aesthetic"]))
+        results.update(eval_aesthetic(audio_path, evaluators["aesthetic"])[0])
     return results
interface.py
CHANGED
@@ -1,3 +1,6 @@
+import time
+import uuid
+
 import gradio as gr
 import yaml

@@ -201,29 +204,28 @@ class GradioInterface:
         return gr.update(value=self.current_melody_source)

     def update_voice(self, voice):
-        self.current_voice = self.svs_model_map[self.current_svs_model]["voices"][
-            voice
-        ]
+        self.current_voice = self.svs_model_map[self.current_svs_model]["voices"][voice]
         return gr.update(value=voice)

     def run_pipeline(self, audio_path):
         if not audio_path:
             return gr.update(value=""), gr.update(value="")
+        tmp_file = f"audio_{int(time.time())}_{uuid.uuid4().hex[:8]}.wav"
         results = self.pipeline.run(
             audio_path,
             self.svs_model_map[self.current_svs_model]["lang"],
             self.character_info[self.current_character].prompt,
             self.current_voice,
-
+            output_audio_path=tmp_file,
         )
         formatted_logs = f"ASR: {results['asr_text']}\nLLM: {results['llm_text']}"
-        return gr.update(value=formatted_logs), gr.update(
+        return gr.update(value=formatted_logs), gr.update(
+            value=results["output_audio_path"]
+        )

     def update_metrics(self, audio_path):
         if not audio_path:
             return gr.update(value="")
         results = self.pipeline.evaluate(audio_path)
-        formatted_metrics = "\n".join(
-            [f"{k}: {v}" for k, v in results.items()]
-        )
+        formatted_metrics = "\n".join([f"{k}: {v}" for k, v in results.items()])
         return gr.update(value=formatted_metrics)
modules/asr.py
CHANGED
@@ -57,10 +57,5 @@ class WhisperASR(AbstractASRModel):

     def transcribe(self, audio: np.ndarray, audio_sample_rate: int, language: str, **kwargs) -> str:
         if audio_sample_rate != 16000:
-
-
-        except Exception as e:
-            breakpoint()
-            print(f"Error resampling audio: {e}")
-        audio = librosa.resample(audio, orig_sr=audio_sample_rate, target_sr=16000)
-        return self.pipe(audio, generate_kwargs={"language": language}).get("text", "")
+            audio = librosa.resample(audio, orig_sr=audio_sample_rate, target_sr=16000)
+        return self.pipe(audio, generate_kwargs={"language": language}, return_timestamps=False).get("text", "")
modules/asr/__init__.py
ADDED
@@ -0,0 +1,11 @@
from .base import AbstractASRModel
from .registry import ASR_MODEL_REGISTRY, get_asr_model, register_asr_model
from .whisper import WhisperASR
from .paraformer import ParaformerASR

__all__ = [
    "AbstractASRModel",
    "get_asr_model",
    "register_asr_model",
    "ASR_MODEL_REGISTRY",
]
modules/asr/base.py
ADDED
@@ -0,0 +1,24 @@
from abc import ABC, abstractmethod
from typing import Optional

import numpy as np


class AbstractASRModel(ABC):
    def __init__(
        self, model_id: str, device: str = "auto", cache_dir: str = "cache", **kwargs
    ):
        print(f"Loading ASR model {model_id}...")
        self.model_id = model_id
        self.device = device
        self.cache_dir = cache_dir

    @abstractmethod
    def transcribe(
        self,
        audio: np.ndarray,
        audio_sample_rate: int,
        language: Optional[str] = None,
        **kwargs,
    ) -> str:
        pass
modules/asr/paraformer.py
ADDED
@@ -0,0 +1,86 @@
import os
import tempfile
from typing import Optional

import numpy as np
import soundfile as sf

try:
    from funasr import AutoModel
except ImportError:
    AutoModel = None

from .base import AbstractASRModel
from .registry import register_asr_model


@register_asr_model("funasr/paraformer-zh")
class ParaformerASR(AbstractASRModel):
    def __init__(
        self, model_id: str, device: str = "auto", cache_dir: str = "cache", **kwargs
    ):
        super().__init__(model_id, device, cache_dir, **kwargs)

        if AutoModel is None:
            raise ImportError(
                "funasr is not installed. Please install it with: pip3 install -U funasr"
            )

        model_name = model_id.replace("funasr/", "")
        language = model_name.split("-")[1]
        if language == "zh":
            self.language = "mandarin"
        elif language == "en":
            self.language = "english"
        else:
            raise ValueError(
                f"Language cannot be determined. {model_id} is not supported"
            )

        try:
            original_cache_dir = os.getenv("MODELSCOPE_CACHE")
            os.makedirs(cache_dir, exist_ok=True)
            os.environ["MODELSCOPE_CACHE"] = cache_dir
            self.model = AutoModel(
                model=model_name,
                model_revision="v2.0.4",
                vad_model="fsmn-vad",
                vad_model_revision="v2.0.4",
                punc_model="ct-punc-c",
                punc_model_revision="v2.0.4",
                device=device,
            )
            if original_cache_dir:
                os.environ["MODELSCOPE_CACHE"] = original_cache_dir
            else:
                del os.environ["MODELSCOPE_CACHE"]

        except Exception as e:
            raise ValueError(f"Error loading Paraformer model: {e}")

    def transcribe(
        self,
        audio: np.ndarray,
        audio_sample_rate: int,
        language: Optional[str] = None,
        **kwargs,
    ) -> str:
        if language and language != self.language:
            raise ValueError(
                f"Paraformer model {self.model_id} only supports {self.language} language, but {language} was requested"
            )

        try:
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
                sf.write(f.name, audio, audio_sample_rate)
                temp_file = f.name

            result = self.model.generate(input=temp_file, batch_size_s=300, **kwargs)

            os.unlink(temp_file)

            print(f"Transcription result: {result}, type: {type(result)}")

            return result[0]["text"]
        except Exception as e:
            raise ValueError(f"Error during transcription: {e}")
modules/asr/registry.py
ADDED
@@ -0,0 +1,19 @@
from .base import AbstractASRModel

ASR_MODEL_REGISTRY = {}


def register_asr_model(prefix: str):
    def wrapper(cls):
        assert issubclass(cls, AbstractASRModel), f"{cls} must inherit AbstractASRModel"
        ASR_MODEL_REGISTRY[prefix] = cls
        return cls

    return wrapper


def get_asr_model(model_id: str, device="auto", **kwargs) -> AbstractASRModel:
    for prefix, cls in ASR_MODEL_REGISTRY.items():
        if model_id.startswith(prefix):
            return cls(model_id, device=device, **kwargs)
    raise ValueError(f"No ASR wrapper found for model: {model_id}")
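The registry dispatches on model-id prefixes: the first registered prefix that matches is instantiated. A sketch of how another backend would plug in; `DummyASR` and the `dummy/` prefix are hypothetical and exist only to show the mechanism (importing `modules.asr` assumes transformers/librosa from `requirements.txt` are installed):

```python
import numpy as np

from modules.asr import AbstractASRModel, get_asr_model, register_asr_model


@register_asr_model("dummy/")
class DummyASR(AbstractASRModel):
    def transcribe(self, audio, audio_sample_rate, language=None, **kwargs) -> str:
        # A real backend would run inference here.
        return ""


asr = get_asr_model("dummy/echo", device="cpu", cache_dir=".cache")
print(asr.transcribe(np.zeros(16000, dtype=np.float32), 16000, language="english"))
```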
modules/asr/whisper.py
ADDED
@@ -0,0 +1,82 @@
import os
from typing import Optional

import librosa
import numpy as np
from transformers.pipelines import pipeline

from .base import AbstractASRModel
from .registry import register_asr_model

hf_token = os.getenv("HF_TOKEN")


@register_asr_model("openai/whisper")
class WhisperASR(AbstractASRModel):
    def __init__(
        self, model_id: str, device: str = "auto", cache_dir: str = "cache", **kwargs
    ):
        super().__init__(model_id, device, cache_dir, **kwargs)
        model_kwargs = kwargs.setdefault("model_kwargs", {})
        model_kwargs["cache_dir"] = cache_dir
        self.pipe = pipeline(
            "automatic-speech-recognition",
            model=model_id,
            device_map=device,
            token=hf_token,
            **kwargs,
        )

    def transcribe(
        self,
        audio: np.ndarray,
        audio_sample_rate: int,
        language: Optional[str] = None,
        **kwargs,
    ) -> str:
        """
        Transcribe audio using Whisper model

        Args:
            audio: Audio numpy array
            audio_sample_rate: Sample rate of the audio
            language: Language hint (optional)

        Returns:
            Transcribed text as string
        """
        try:
            # Resample to 16kHz if needed
            if audio_sample_rate != 16000:
                audio = librosa.resample(
                    audio, orig_sr=audio_sample_rate, target_sr=16000
                )

            # Generate transcription
            generate_kwargs = {}
            if language:
                generate_kwargs["language"] = language

            result = self.pipe(
                audio,
                generate_kwargs=generate_kwargs,
                return_timestamps=False,
                **kwargs,
            )

            # Extract text from result
            if isinstance(result, dict) and "text" in result:
                return result["text"]
            elif isinstance(result, list) and len(result) > 0:
                # Handle list of results
                first_result = result[0]
                if isinstance(first_result, dict):
                    return first_result.get("text", str(first_result))
                else:
                    return str(first_result)
            else:
                return str(result)

        except Exception as e:
            print(f"Error during Whisper transcription: {e}")
            return ""
modules/llm.py
DELETED
@@ -1,61 +0,0 @@
import os
from abc import ABC, abstractmethod

from transformers import pipeline

LLM_MODEL_REGISTRY = {}
hf_token = os.getenv("HF_TOKEN")


class AbstractLLMModel(ABC):
    def __init__(
        self, model_id: str, device: str = "cpu", cache_dir: str = "cache", **kwargs
    ):
        print(f"Loading LLM model {model_id}...")
        self.model_id = model_id
        self.device = device
        self.cache_dir = cache_dir

    @abstractmethod
    def generate(self, prompt: str, **kwargs) -> str:
        pass


def register_llm_model(prefix: str):
    def wrapper(cls):
        assert issubclass(cls, AbstractLLMModel), f"{cls} must inherit AbstractLLMModel"
        LLM_MODEL_REGISTRY[prefix] = cls
        return cls

    return wrapper


def get_llm_model(model_id: str, device="cpu", **kwargs) -> AbstractLLMModel:
    for prefix, cls in LLM_MODEL_REGISTRY.items():
        if model_id.startswith(prefix):
            return cls(model_id, device=device, **kwargs)
    raise ValueError(f"No LLM wrapper found for model: {model_id}")


@register_llm_model("google/gemma")
@register_llm_model("tii/")  # e.g., Falcon
@register_llm_model("meta-llama")
class HFTextGenerationLLM(AbstractLLMModel):
    def __init__(
        self, model_id: str, device: str = "cpu", cache_dir: str = "cache", **kwargs
    ):
        super().__init__(model_id, device, cache_dir, **kwargs)
        model_kwargs = kwargs.setdefault("model_kwargs", {})
        model_kwargs["cache_dir"] = cache_dir
        self.pipe = pipeline(
            "text-generation",
            model=model_id,
            device=0 if device == "cuda" else -1,
            return_full_text=False,
            token=hf_token,
            **kwargs,
        )

    def generate(self, prompt: str, **kwargs) -> str:
        outputs = self.pipe(prompt, **kwargs)
        return outputs[0]["generated_text"] if outputs else ""
modules/llm/__init__.py
ADDED
@@ -0,0 +1,14 @@
from .base import AbstractLLMModel
from .registry import LLM_MODEL_REGISTRY, get_llm_model, register_llm_model
from .gemma import GemmaLLM
from .qwen3 import Qwen3LLM
from .gemini import GeminiLLM
from .minimax import MiniMaxLLM
from .llama import LlamaLLM

__all__ = [
    "AbstractLLMModel",
    "get_llm_model",
    "register_llm_model",
    "LLM_MODEL_REGISTRY",
]
modules/llm/base.py
ADDED
@@ -0,0 +1,15 @@
from abc import ABC, abstractmethod


class AbstractLLMModel(ABC):
    def __init__(
        self, model_id: str, device: str = "auto", cache_dir: str = "cache", **kwargs
    ):
        print(f"Loading LLM model {model_id}...")
        self.model_id = model_id
        self.device = device
        self.cache_dir = cache_dir

    @abstractmethod
    def generate(self, prompt: str, **kwargs) -> str:
        pass
modules/llm/gemini.py
ADDED
@@ -0,0 +1,50 @@
import os
from typing import Optional

from google import genai
from google.genai import types

from .base import AbstractLLMModel
from .registry import register_llm_model


GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")


@register_llm_model("gemini-2.5-flash")
class GeminiLLM(AbstractLLMModel):
    def __init__(
        self, model_id: str, device: str = "auto", cache_dir: str = "cache", **kwargs
    ):
        if not GOOGLE_API_KEY:
            raise ValueError(
                "Please set the GOOGLE_API_KEY environment variable to use Gemini."
            )
        super().__init__(model_id=model_id, **kwargs)
        self.client = genai.Client(api_key=GOOGLE_API_KEY)

    def generate(
        self,
        prompt: str,
        system_prompt: Optional[str] = None,
        max_output_tokens: int = 1024,
        **kwargs,
    ) -> str:
        generation_config_dict = {
            "max_output_tokens": max_output_tokens,
            **kwargs,
        }
        if system_prompt:
            generation_config_dict["system_instruction"] = system_prompt
        response = self.client.models.generate_content(
            model=self.model_id,
            contents=prompt,
            config=types.GenerateContentConfig(**generation_config_dict),
        )
        if response.text:
            return response.text
        else:
            print(
                f"No response from Gemini. May need to increase max_output_tokens. Current max_output_tokens: {max_output_tokens}"
            )
            return ""
modules/llm/gemma.py
ADDED
@@ -0,0 +1,35 @@
import os
from typing import Optional

from transformers import pipeline

from .base import AbstractLLMModel
from .registry import register_llm_model

hf_token = os.getenv("HF_TOKEN")


@register_llm_model("google/gemma-")
class GemmaLLM(AbstractLLMModel):
    def __init__(
        self, model_id: str, device: str = "auto", cache_dir: str = "cache", **kwargs
    ):
        super().__init__(model_id, device, cache_dir, **kwargs)
        model_kwargs = kwargs.setdefault("model_kwargs", {})
        model_kwargs["cache_dir"] = cache_dir
        self.pipe = pipeline(
            "text-generation",
            model=model_id,
            device_map=device,
            return_full_text=False,
            token=hf_token,
            trust_remote_code=True,
            **kwargs,
        )

    def generate(self, prompt: str, system_prompt: Optional[str] = None, max_new_tokens=50, **kwargs) -> str:
        if not system_prompt:
            system_prompt = ""
        formatted_prompt = f"{system_prompt}\n\n现在,有人对你说:{prompt}\n\n你这样回答:"
        outputs = self.pipe(formatted_prompt, max_new_tokens=max_new_tokens, **kwargs)
        return outputs[0]["generated_text"] if outputs else ""
modules/llm/llama.py
ADDED
@@ -0,0 +1,44 @@
import os
from typing import Optional

from transformers import pipeline

from .base import AbstractLLMModel
from .registry import register_llm_model

hf_token = os.getenv("HF_TOKEN")


@register_llm_model("meta-llama/Llama-")
class LlamaLLM(AbstractLLMModel):
    def __init__(
        self, model_id: str, device: str = "auto", cache_dir: str = "cache", **kwargs
    ):
        super().__init__(model_id, device, cache_dir, **kwargs)
        model_kwargs = kwargs.setdefault("model_kwargs", {})
        model_kwargs["cache_dir"] = cache_dir
        self.pipe = pipeline(
            "text-generation",
            model=model_id,
            device_map=device,
            return_full_text=False,
            token=hf_token,
            trust_remote_code=True,
            **kwargs,
        )

    def generate(
        self,
        prompt: str,
        system_prompt: Optional[
            str
        ] = "You are a pirate chatbot who always responds in pirate speak!",
        max_new_tokens: int = 256,
        **kwargs
    ) -> str:
        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})
        messages.append({"role": "user", "content": prompt})
        outputs = self.pipe(messages, max_new_tokens=max_new_tokens, **kwargs)
        return outputs[0]["generated_text"]
modules/llm/minimax.py
ADDED
@@ -0,0 +1,122 @@
# Ref: https://github.com/MiniMax-AI/MiniMax-01

from typing import Optional

import torch
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    GenerationConfig,
    QuantoConfig,
)

from .base import AbstractLLMModel
from .registry import register_llm_model


@register_llm_model("MiniMaxAI/MiniMax-Text-01")
class MiniMaxLLM(AbstractLLMModel):
    def __init__(
        self, model_id: str, device: str = "cuda", cache_dir: str = "cache", **kwargs
    ):
        try:
            if not torch.cuda.is_available():
                raise RuntimeError("MiniMax model only supports CUDA device")
            super().__init__(model_id, device, cache_dir, **kwargs)

            # load hf config
            hf_config = AutoConfig.from_pretrained(
                "MiniMaxAI/MiniMax-Text-01", trust_remote_code=True, cache_dir=cache_dir,
            )

            # quantization config, int8 is recommended
            quantization_config = QuantoConfig(
                weights="int8",
                modules_to_not_convert=[
                    "lm_head",
                    "embed_tokens",
                ]
                + [
                    f"model.layers.{i}.coefficient"
                    for i in range(hf_config.num_hidden_layers)
                ]
                + [
                    f"model.layers.{i}.block_sparse_moe.gate"
                    for i in range(hf_config.num_hidden_layers)
                ],
            )

            # assume 8 GPUs
            world_size = torch.cuda.device_count()
            layers_per_device = hf_config.num_hidden_layers // world_size
            # set device map
            device_map = {
                "model.embed_tokens": "cuda:0",
                "model.norm": f"cuda:{world_size - 1}",
                "lm_head": f"cuda:{world_size - 1}",
            }
            for i in range(world_size):
                for j in range(layers_per_device):
                    device_map[f"model.layers.{i * layers_per_device + j}"] = f"cuda:{i}"

            # load tokenizer
            self.tokenizer = AutoTokenizer.from_pretrained(
                "MiniMaxAI/MiniMax-Text-01", cache_dir=cache_dir
            )

            # load bfloat16 model, move to device, and apply quantization
            self.quantized_model = AutoModelForCausalLM.from_pretrained(
                "MiniMaxAI/MiniMax-Text-01",
                torch_dtype="bfloat16",
                device_map=device_map,
                quantization_config=quantization_config,
                trust_remote_code=True,
                offload_buffers=True,
                cache_dir=cache_dir,
            )
        except Exception as e:
            print(f"Failed to load MiniMax model: {e}")
            breakpoint()
            raise e

    def generate(
        self,
        prompt: str,
        system_prompt: Optional[
            str
        ] = "You are a helpful assistant created by MiniMax based on MiniMax-Text-01 model.",
        max_new_tokens: int = 20,
        **kwargs,
    ) -> str:
        messages = []
        if system_prompt:
            messages.append(
                {
                    "role": "system",
                    "content": [{"type": "text", "text": system_prompt}],
                }
            )

        messages.append({"role": "user", "content": [
            {"type": "text", "text": prompt}]})
        text = self.tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        # tokenize and move to device
        model_inputs = self.tokenizer(text, return_tensors="pt").to("cuda")
        generation_config = GenerationConfig(
            max_new_tokens=max_new_tokens,
            eos_token_id=200020,
            use_cache=True,
        )
        generated_ids = self.quantized_model.generate(
            **model_inputs, generation_config=generation_config
        )
        generated_ids = [
            output_ids[len(input_ids):]
            for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]
        response = self.tokenizer.batch_decode(
            generated_ids, skip_special_tokens=True)[0]
        return response
modules/llm/qwen3.py
ADDED
@@ -0,0 +1,53 @@
# Ref: https://qwenlm.github.io/blog/qwen3/

from typing import Optional

from .base import AbstractLLMModel
from .registry import register_llm_model
from transformers import AutoModelForCausalLM, AutoTokenizer


@register_llm_model("Qwen/Qwen3-")
class Qwen3LLM(AbstractLLMModel):
    def __init__(
        self, model_id: str, device: str = "auto", cache_dir: str = "cache", **kwargs
    ):
        super().__init__(model_id, device, cache_dir, **kwargs)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_id, device_map=device, torch_dtype="auto", cache_dir=cache_dir
        ).eval()
        self.tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)

    def generate(
        self,
        prompt: str,
        system_prompt: Optional[str] = None,
        max_new_tokens: int = 256,
        enable_thinking: bool = False,
        **kwargs
    ) -> str:
        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})
        messages.append({"role": "user", "content": prompt})
        text = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=enable_thinking,
        )
        model_inputs = self.tokenizer([text], return_tensors="pt").to(self.model.device)
        generated_ids = self.model.generate(
            **model_inputs, max_new_tokens=max_new_tokens
        )
        output_ids = generated_ids[0][len(model_inputs.input_ids[0]) :].tolist()
        # parse thinking content
        if enable_thinking:
            try:
                # rindex finding 151668 (</think>)
                index = len(output_ids) - output_ids[::-1].index(151668)
            except ValueError:
                index = 0
            output_ids = output_ids[index:]

        return self.tokenizer.decode(output_ids, skip_special_tokens=True).strip("\n")
modules/llm/registry.py
ADDED
@@ -0,0 +1,19 @@
from .base import AbstractLLMModel

LLM_MODEL_REGISTRY = {}


def register_llm_model(prefix: str):
    def wrapper(cls):
        assert issubclass(cls, AbstractLLMModel), f"{cls} must inherit AbstractLLMModel"
        LLM_MODEL_REGISTRY[prefix] = cls
        return cls

    return wrapper


def get_llm_model(model_id: str, device="auto", **kwargs) -> AbstractLLMModel:
    for prefix, cls in LLM_MODEL_REGISTRY.items():
        if model_id.startswith(prefix):
            return cls(model_id, device=device, **kwargs)
    raise ValueError(f"No LLM wrapper found for model: {model_id}")
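The LLM side uses the same prefix dispatch (`google/gemma-` → GemmaLLM, `meta-llama/Llama-` → LlamaLLM, `Qwen/Qwen3-` → Qwen3LLM, `gemini-2.5-flash` → GeminiLLM). A hedged usage sketch; weights are downloaded on first use and the prompt strings are illustrative:

```python
from modules.llm import get_llm_model

llm = get_llm_model("Qwen/Qwen3-8B", device="auto", cache_dir=".cache")

reply = llm.generate(
    "今天天气如何?",                    # user turn
    system_prompt="请用一句歌词回答。",  # character/system prompt
    max_new_tokens=64,
)
print(reply)
```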
modules/melody.py
CHANGED
@@ -37,7 +37,7 @@ class MelodyController:
             return ""

         prompt = (
-            "\n
+            "\n请按照歌词格式回复,每句需遵循以下字数规则:"
             + "".join(
                 [
                     f"\n第{i}句:{c}个字"
@@ -109,9 +109,10 @@ class MelodyController:
             if pitch == 0:
                 score.append((st, ed, ref_lyric, pitch))
             elif ref_lyric in ["-", "——"] and align_type == "lyric":
-                score.append((st, ed,
-                text_idx += 1
+                score.append((st, ed, "-", pitch))
             else:
                 score.append((st, ed, text_list[text_idx], pitch))
                 text_idx += 1
+                if text_idx >= len(text_list):
+                    break
         return score
modules/svs/base.py
CHANGED
@@ -6,7 +6,7 @@ import numpy as np
 class AbstractSVSModel(ABC):
     @abstractmethod
     def __init__(
-        self, model_id: str, device: str = "
+        self, model_id: str, device: str = "auto", cache_dir: str = "cache", **kwargs
     ): ...

     @abstractmethod
modules/svs/espnet.py
CHANGED
@@ -14,17 +14,17 @@ from .registry import register_svs_model
 
 @register_svs_model("espnet/")
 class ESPNetSVS(AbstractSVSModel):
-    def __init__(self, model_id: str, device="
+    def __init__(self, model_id: str, device="auto", cache_dir="cache", **kwargs):
         from espnet2.bin.svs_inference import SingingGenerate
         from espnet_model_zoo.downloader import ModelDownloader
-
-
+        if device == "auto":
+            device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.device = device
         downloaded = ModelDownloader(cache_dir).download_and_unpack(model_id)
-        print(f"Downloaded {model_id} to {cache_dir}")  # TODO: should improve log code
         self.model = SingingGenerate(
             train_config=downloaded["train_config"],
             model_file=downloaded["model_file"],
-            device=device,
+            device=self.device,
         )
         self.model_id = model_id
         self.output_sample_rate = self.model.fs
@@ -53,7 +53,7 @@
         phoneme_mappers = {}
         return phoneme_mappers
 
-    def _preprocess(self, score: list[tuple[float, float, str, int]], language: str):
+    def _preprocess(self, score: list[tuple[float, float, str, int] | tuple[float, float, str, float]], language: str):
         if language not in self.phoneme_mappers:
             raise ValueError(f"Unsupported language: {language} for {self.model_id}")
         phoneme_mapper = self.phoneme_mappers[language]
@@ -90,16 +90,16 @@
             pre_phn = phn_units[-1]
 
         batch = {
-            "score":
-
-
-
+            "score": (
+                120,  # does not affect svs result, as note durations are in time unit
+                notes,
+            ),
             "text": " ".join(phns),
         }
         return batch
 
     def synthesize(
-        self, score: list[tuple[float, float, str, int]], language: str, speaker: str, **kwargs
+        self, score: list[tuple[float, float, str, float] | tuple[float, float, str, int]], language: str, speaker: str, **kwargs
     ):
         batch = self._preprocess(score, language)
         if self.model_id == "espnet/aceopencpop_svs_visinger2_40singer_pretrain":
@@ -107,8 +107,8 @@
             output_dict = self.model(batch, sids=sid)
         elif self.model_id == "espnet/mixdata_svs_visinger2_spkemb_lang_pretrained":
             langs = {
-                "
-                "
+                "mandarin": 2,
+                "japanese": 1,
             }
             if language not in langs:
                 raise ValueError(
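The `device="auto"` default added to `ESPNetSVS.__init__` resolves to CUDA when a GPU is visible and falls back to CPU otherwise. A minimal standalone sketch of that resolution step (the helper name `resolve_device` is illustrative, not part of the repo):

```python
import torch


def resolve_device(device: str = "auto") -> str:
    # Same pattern as the new ESPNetSVS.__init__: "auto" picks CUDA when available,
    # otherwise CPU; explicit values such as "cuda" or "cpu" pass through unchanged.
    if device == "auto":
        return "cuda" if torch.cuda.is_available() else "cpu"
    return device


print(resolve_device())       # "cuda" on a GPU machine, "cpu" otherwise
print(resolve_device("cpu"))  # "cpu"
```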
modules/svs/registry.py
CHANGED
@@ -12,7 +12,7 @@ def register_svs_model(prefix: str):
     return wrapper
 
 
-def get_svs_model(model_id: str, device="
+def get_svs_model(model_id: str, device="auto", **kwargs) -> AbstractSVSModel:
     for prefix, cls in SVS_MODEL_REGISTRY.items():
         if model_id.startswith(prefix):
             return cls(model_id, device=device, **kwargs)
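For context, `get_svs_model` dispatches on a model-id prefix (e.g. `espnet/...`) against classes that registered themselves with the `@register_svs_model` decorator. A condensed, self-contained sketch of that pattern; the trailing `ValueError` for unknown ids is an assumption, since the real module's fallback behaviour is not shown in this hunk:

```python
SVS_MODEL_REGISTRY: dict[str, type] = {}


def register_svs_model(prefix: str):
    # Decorator: a backend class registers itself under a model-id prefix such as "espnet/".
    def wrapper(cls):
        SVS_MODEL_REGISTRY[prefix] = cls
        return cls
    return wrapper


def get_svs_model(model_id: str, device: str = "auto", **kwargs):
    # The first registered prefix that matches the model id wins.
    for prefix, cls in SVS_MODEL_REGISTRY.items():
        if model_id.startswith(prefix):
            return cls(model_id, device=device, **kwargs)
    raise ValueError(f"No SVS backend registered for model id: {model_id}")
```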
modules/utils/g2p.py
CHANGED
@@ -32,6 +32,7 @@ for plan in ace_phonemes_all_plans["plans"]:
 
 
 def preprocess_text(text: str, language: str) -> list[str]:
+    text = text.replace(" ", "")
     if language == "mandarin":
         text_list = to_pinyin(text)
     elif language == "japanese":
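The single added line strips spaces before grapheme-to-phoneme conversion; presumably this pairs with the `text_normalize.py` change below, where whitespace now survives `remove_non_zh_jp`. A trivial illustration of the step itself:

```python
text = "你好 世界"
text = text.replace(" ", "")  # "你好世界" — spaces never reach the pinyin/kana lookup
print(text)
```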
modules/utils/text_normalize.py
CHANGED
@@ -3,12 +3,13 @@ from typing import Optional
 
 
 def remove_non_zh_jp(text: str) -> str:
-    pattern = r"[^\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\u3000-\u303f\uff01-\uffef]"
+    pattern = r"[^\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\u3000-\u303f\uff01-\uffef\s]"
     return re.sub(pattern, "", text)
 
 
 def truncate_sentences(text: str, max_sentences: int) -> str:
-    sentences = re.split(r"(?<=[
+    sentences = re.split(r"(?<=[。!?!?~])|(?:\n+)|(?: {2,})", text)
+    sentences = [s.strip() for s in sentences if s.strip()]
     return "".join(sentences[:max_sentences]).strip()
 
 
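A quick sanity check of the new splitting behaviour, as a standalone copy of the updated function (the sample strings and expected outputs are illustrative, not taken from the repo's tests):

```python
import re


def truncate_sentences(text: str, max_sentences: int) -> str:
    # Split after CJK/fullwidth sentence enders, on newlines, or on runs of 2+ spaces,
    # drop empty fragments, then keep only the first max_sentences pieces.
    sentences = re.split(r"(?<=[。!?!?~])|(?:\n+)|(?: {2,})", text)
    sentences = [s.strip() for s in sentences if s.strip()]
    return "".join(sentences[:max_sentences]).strip()


print(truncate_sentences("你好!今天天气不错。我们去唱歌吧?", 2))  # -> 你好!今天天气不错。
print(truncate_sentences("第一句\n第二句\n第三句", 2))              # -> 第一句第二句
```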
pipeline.py
CHANGED
@@ -1,6 +1,11 @@
-import
+from __future__ import annotations
+
 import time
+from pathlib import Path
+
 import librosa
+import soundfile as sf
+import torch
 
 from modules.asr import get_asr_model
 from modules.llm import get_llm_model
@@ -29,20 +34,36 @@ class SingingDialoguePipeline:
         self.melody_controller = MelodyController(
             config["melody_source"], self.cache_dir
         )
+        self.max_sentences = config.get("max_sentences", 2)
         self.track_latency = config.get("track_latency", False)
         self.evaluators = load_evaluators(config.get("evaluators", {}).get("svs", []))
 
     def set_asr_model(self, asr_model: str):
+        if self.asr is not None:
+            del self.asr
+            import gc
+            gc.collect()
+            torch.cuda.empty_cache()
         self.asr = get_asr_model(
             asr_model, device=self.device, cache_dir=self.cache_dir
         )
 
     def set_llm_model(self, llm_model: str):
+        if self.llm is not None:
+            del self.llm
+            import gc
+            gc.collect()
+            torch.cuda.empty_cache()
        self.llm = get_llm_model(
             llm_model, device=self.device, cache_dir=self.cache_dir
         )
 
     def set_svs_model(self, svs_model: str):
+        if self.svs is not None:
+            del self.svs
+            import gc
+            gc.collect()
+            torch.cuda.empty_cache()
         self.svs = get_svs_model(
             svs_model, device=self.device, cache_dir=self.cache_dir
         )
@@ -54,9 +75,9 @@ class SingingDialoguePipeline:
         self,
         audio_path,
         language,
-
+        system_prompt,
         speaker,
-
+        output_audio_path: Path | str = None,
     ):
         if self.track_latency:
             asr_start_time = time.time()
@@ -67,16 +88,16 @@
         if self.track_latency:
             asr_end_time = time.time()
             asr_latency = asr_end_time - asr_start_time
-        melody_prompt = self.melody_controller.get_melody_constraints()
-        prompt = prompt_template.format(melody_prompt, asr_result)
+        melody_prompt = self.melody_controller.get_melody_constraints(max_num_phrases=self.max_sentences)
         if self.track_latency:
             llm_start_time = time.time()
-        output = self.llm.generate(
+        output = self.llm.generate(asr_result, system_prompt + melody_prompt)
         if self.track_latency:
             llm_end_time = time.time()
             llm_latency = llm_end_time - llm_start_time
-
-
+        llm_response = clean_llm_output(
+            output, language=language, max_sentences=self.max_sentences
+        )
         score = self.melody_controller.generate_score(llm_response, language)
         if self.track_latency:
             svs_start_time = time.time()
@@ -89,14 +110,18 @@
         results = {
             "asr_text": asr_result,
             "llm_text": llm_response,
-            "svs_audio": (
+            "svs_audio": (sample_rate, singing_audio),
         }
+        if output_audio_path:
+            Path(output_audio_path).parent.mkdir(parents=True, exist_ok=True)
+            sf.write(output_audio_path, singing_audio, sample_rate)
+            results["output_audio_path"] = output_audio_path
         if self.track_latency:
-            results["metrics"]
+            results["metrics"] = {
                 "asr_latency": asr_latency,
                 "llm_latency": llm_latency,
                 "svs_latency": svs_latency,
-        }
+            }
         return results
 
     def evaluate(self, audio_path):
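Taken together, the pipeline now threads a `system_prompt` through to the LLM (concatenated with the melody constraints), truncates the reply to `max_sentences`, and can optionally write the rendered waveform to disk. A hedged usage sketch; the entry-point name `run` is an assumption (the hunk shows its parameters but not its name), and the config path and speaker value are placeholders:

```python
import yaml

from pipeline import SingingDialoguePipeline

with open("config/cli/yaoyin_default.yaml") as f:
    config = yaml.safe_load(f)

pipeline = SingingDialoguePipeline(config)
results = pipeline.run(                       # method name assumed; see note above
    "tests/audio/hello.wav",                  # audio_path
    "mandarin",                               # language
    system_prompt="<character prompt here>",  # placeholder
    speaker="default",                        # placeholder speaker id for the SVS backend
    output_audio_path="outputs/demo.wav",
)
print(results["asr_text"])
print(results["llm_text"])
sample_rate, waveform = results["svs_audio"]
# results["metrics"] holds ASR/LLM/SVS latencies when track_latency is enabled.
```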
requirements.txt
CHANGED
@@ -12,9 +12,9 @@ pykakasi
 basic-pitch[onnx]
 audiobox_aesthetics
 transformers
-s3prl
 zhconv
 git+https://github.com/sea-turt1e/kanjiconv
 soundfile
 PyYAML
 gradio
+google-generativeai
tests/__init__.py
ADDED
File without changes
tests/test_asr_infer.py
ADDED
@@ -0,0 +1,19 @@
+from modules.asr import get_asr_model
+import librosa
+
+if __name__ == "__main__":
+    supported_asrs = [
+        "funasr/paraformer-zh",
+        "openai/whisper-large-v3-turbo",
+    ]
+    for model_id in supported_asrs:
+        try:
+            print(f"Loading model: {model_id}")
+            asr_model = get_asr_model(model_id, device="auto", cache_dir=".cache")
+            audio, sample_rate = librosa.load("tests/audio/hello.wav", sr=None)
+            result = asr_model.transcribe(audio, sample_rate, language="mandarin")
+            print(result)
+        except Exception as e:
+            print(f"Failed to load model {model_id}: {e}")
+            breakpoint()
+            continue
tests/test_llm_infer.py
ADDED
@@ -0,0 +1,30 @@
+from characters import get_character
+from modules.llm import get_llm_model
+from time import time
+
+if __name__ == "__main__":
+    supported_llms = [
+        # "MiniMaxAI/MiniMax-Text-01",
+        # "Qwen/Qwen3-8B",
+        # "Qwen/Qwen3-30B-A3B",
+        # "meta-llama/Llama-3.1-8B-Instruct",
+        # "tiiuae/Falcon-H1-1B-Base",
+        # "tiiuae/Falcon-H1-3B-Instruct",
+        # "google/gemma-2-2b",
+        # "gemini-2.5-flash",
+    ]
+    character_prompt = get_character("Yaoyin").prompt
+    for model_id in supported_llms:
+        try:
+            print(f"Loading model: {model_id}")
+            llm = get_llm_model(model_id, cache_dir="./.cache")
+            prompt = "你好,今天你心情怎么样?"
+            start_time = time()
+            result = llm.generate(prompt, system_prompt=character_prompt)
+            end_time = time()
+            print(f"[{model_id}] LLM inference time: {end_time - start_time:.2f} seconds")
+            print(f"[{model_id}] LLM inference result:", result)
+        except Exception as e:
+            print(f"Failed to load model {model_id}: {e}")
+            breakpoint()
+            continue