Merge branch 'refactor' into fwh_dev
- .gitattributes +2 -0
- README.md +144 -0
- app.py +2 -1
- characters/Limei.py +1 -1
- characters/Yaoyin.py +1 -1
- characters/base.py +1 -1
- cli.py +52 -0
- config/cli/limei_default.yaml +16 -0
- config/cli/yaoyin_default.yaml +16 -0
- config/cli/yaoyin_test.yaml +11 -0
- config/{default.yaml → interface/default.yaml} +0 -0
- config/{options.yaml → interface/options.yaml} +22 -18
- evaluation/svs_eval.py +10 -13
- interface.py +47 -32
- modules/asr.py +6 -11
- modules/llm/__init__.py +11 -0
- modules/llm/base.py +15 -0
- modules/{llm.py → llm/hf_pipeline.py} +10 -31
- modules/llm/registry.py +19 -0
- modules/melody.py +3 -2
- modules/svs/base.py +2 -0
- modules/svs/espnet.py +8 -8
- modules/utils/g2p.py +1 -0
- modules/utils/text_normalize.py +3 -2
- pipeline.py +23 -11
- requirements.txt +4 -1
- tests/__init__.py +0 -0
- tests/audio/chat.wav +3 -0
- tests/audio/feeling.wav +3 -0
- tests/audio/hello.wav +3 -0
- tests/audio/interesting.wav +3 -0
- tests/audio/music.wav +3 -0
- tests/audio/where_from.wav +3 -0
- tests/test_llm_infer.py +26 -0
.gitattributes
CHANGED
@@ -1,2 +1,4 @@
 *.png filter=lfs diff=lfs merge=lfs -text
 *.jpg filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text
+*.mp3 filter=lfs diff=lfs merge=lfs -text
README.md
ADDED
@@ -0,0 +1,144 @@
# SingingSDS: Role-Playing Singing Spoken Dialogue System

A role-playing singing dialogue system that converts speech input into character-based singing output.

## Installation

### Requirements

- Python 3.11+
- CUDA (optional, for GPU acceleration)

### Install Dependencies

#### Option 1: Using Conda (Recommended)

```bash
conda create -n singingsds python=3.11
conda activate singingsds
conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia
pip install -r requirements.txt
```

#### Option 2: Using pip only

```bash
pip install -r requirements.txt
```

#### Option 3: Using pip with virtual environment

```bash
python -m venv singingsds_env

# On Windows:
singingsds_env\Scripts\activate
# On macOS/Linux:
source singingsds_env/bin/activate

pip install -r requirements.txt
```

## Usage

### Command Line Interface (CLI)

#### Example Usage

```bash
python cli.py --query_audio tests/audio/hello.wav --config_path config/cli/yaoyin_default.yaml --output_audio outputs/yaoyin_hello.wav
```

#### Parameter Description

- `--query_audio`: Input audio file path (required)
- `--config_path`: Configuration file path (default: config/cli/yaoyin_default.yaml)
- `--output_audio`: Output audio file path (required)

### Web Interface (Gradio)

Start the web interface:

```bash
python app.py
```

Then visit the displayed address in your browser to use the graphical interface.

## Configuration

### Character Configuration

The system supports multiple preset characters:

- **Yaoyin (遥音)**: Default voice is `voice2`
- **Limei (丽梅)**: Default voice is `voice1`

### Model Configuration

#### ASR Models
- `openai/whisper-large-v3-turbo`
- `openai/whisper-large-v3`
- `openai/whisper-medium`
- `sanchit-gandhi/whisper-small-dv`
- `facebook/wav2vec2-base-960h`

#### LLM Models
- `google/gemma-2-2b`
- `MiniMaxAI/MiniMax-M1-80k`
- `meta-llama/Llama-3.2-3B-Instruct`

#### SVS Models
- `espnet/mixdata_svs_visinger2_spkemb_lang_pretrained` (Bilingual)
- `espnet/aceopencpop_svs_visinger2_40singer_pretrain` (Chinese)

## Project Structure

```
SingingSDS/
├── cli.py                 # Command line interface
├── interface.py           # Gradio interface
├── pipeline.py            # Core processing pipeline
├── app.py                 # Web application entry
├── requirements.txt       # Python dependencies
├── config/                # Configuration files
│   ├── cli/               # CLI-specific configuration
│   └── interface/         # Interface-specific configuration
├── modules/               # Core modules
│   ├── asr.py             # Speech recognition module
│   ├── llm.py             # Large language model module
│   ├── melody.py          # Melody control module
│   ├── svs/               # Singing voice synthesis modules
│   │   ├── base.py        # Base SVS class
│   │   ├── espnet.py      # ESPnet SVS implementation
│   │   ├── registry.py    # SVS model registry
│   │   └── __init__.py    # SVS module initialization
│   └── utils/             # Utility modules
│       ├── g2p.py         # Grapheme-to-phoneme conversion
│       ├── text_normalize.py  # Text normalization
│       └── resources/     # Utility resources
├── characters/            # Character definitions
│   ├── base.py            # Base character class
│   ├── Limei.py           # Limei character definition
│   ├── Yaoyin.py          # Yaoyin character definition
│   └── __init__.py        # Character module initialization
├── evaluation/            # Evaluation modules
│   └── svs_eval.py        # SVS evaluation metrics
├── data/                  # Data directory
│   ├── kising/            # Kising dataset
│   └── touhou/            # Touhou dataset
├── resources/             # Project resources
├── data_handlers/         # Data handling utilities
├── assets/                # Static assets
└── tests/                 # Test files
```

## Contributing

Issues and Pull Requests are welcome!

## License

app.py
CHANGED
@@ -3,7 +3,8 @@ from interface import GradioInterface
 
 def main():
     demo = GradioInterface(
-        options_config="config/options.yaml",
+        options_config="config/interface/options.yaml",
+        default_config="config/interface/default.yaml",
     ).create_interface()
     demo.launch()
 
characters/Limei.py
CHANGED
@@ -5,7 +5,7 @@ def get_character():
     return Character(
         name="Limei (丽梅)",
         image_path="assets/character_limei.png",
-
+        default_voice="voice1",
         prompt="""你是丽梅(Limei),来自幻想世界"长歌原"的角色,一个以歌声传承记忆的世界。
 你是灵响界山林音乐之城"莲鸣"的现任守护者,十九岁的公主殿下,肩负维系与传承城市核心"千年歌谱"的重任。千年歌谱承载着莲鸣城的历史、逝者的余音与后世的誓言,由历任守护者续写。
characters/Yaoyin.py
CHANGED
@@ -5,7 +5,7 @@ def get_character():
     return Character(
         name="Yaoyin (遥音)",
         image_path="assets/character_yaoyin.jpg",
-
+        default_voice="voice2",
         prompt="""你是遥音(Yaoyin),来自幻想世界"长歌原"的角色,一个以歌声传承记忆的世界。
 你是游历四方的歌者与吟游诗人,出生于鹿鸣山·云歌村,常年行走各地,采集歌谣与故事。
characters/base.py
CHANGED
@@ -5,5 +5,5 @@ from dataclasses import dataclass
 class Character:
     name: str
     image_path: str
-
+    default_voice: str
     prompt: str
cli.py
ADDED
@@ -0,0 +1,52 @@
from argparse import ArgumentParser
from logging import getLogger
from pathlib import Path

import yaml

from characters import CHARACTERS
from pipeline import SingingDialoguePipeline

logger = getLogger(__name__)


def get_parser():
    parser = ArgumentParser()
    parser.add_argument("--query_audio", type=Path, required=True)
    parser.add_argument(
        "--config_path", type=Path, default="config/cli/yaoyin_default.yaml"
    )
    parser.add_argument("--output_audio", type=Path, required=True)
    return parser


def load_config(config_path: Path):
    with open(config_path, "r") as f:
        config = yaml.safe_load(f)
    return config


def main():
    parser = get_parser()
    args = parser.parse_args()
    config = load_config(args.config_path)
    pipeline = SingingDialoguePipeline(config)
    speaker = config["speaker"]
    language = config["language"]
    character_name = config["prompt_template_character"]
    character = CHARACTERS[character_name]
    prompt_template = character.prompt
    results = pipeline.run(
        args.query_audio,
        language,
        prompt_template,
        speaker,
        output_audio_path=args.output_audio,
    )
    logger.info(
        f"Input: {args.query_audio}, Output: {args.output_audio}, ASR results: {results['asr_text']}, LLM results: {results['llm_text']}"
    )


if __name__ == "__main__":
    main()
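For reference, the CLI above reduces to a short programmatic call. A minimal sketch, assuming the same config keys as the `config/cli/*.yaml` files added below (paths are illustrative):

```python
# Minimal sketch: driving the pipeline without cli.py (mirrors main() above).
import yaml

from characters import CHARACTERS
from pipeline import SingingDialoguePipeline

with open("config/cli/yaoyin_default.yaml") as f:
    config = yaml.safe_load(f)

pipeline = SingingDialoguePipeline(config)
character = CHARACTERS[config["prompt_template_character"]]

results = pipeline.run(
    "tests/audio/hello.wav",        # query audio
    config["language"],             # e.g. "mandarin"
    character.prompt,               # character prompt template
    config["speaker"],              # SVS speaker id or embedding path
    output_audio_path="outputs/demo.wav",
)
print(results["asr_text"], results["llm_text"])
```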
config/cli/limei_default.yaml
ADDED
@@ -0,0 +1,16 @@
asr_model: openai/whisper-large-v3-turbo
llm_model: google/gemma-2-2b
svs_model: espnet/aceopencpop_svs_visinger2_40singer_pretrain
melody_source: sample-lyric-kising
language: mandarin
prompt_template_character: Limei
speaker: 5
cache_dir: .cache

track_latency: True
evaluators:
  svs:
    - singmos
    - per
    - melody
    - aesthetic
config/cli/yaoyin_default.yaml
ADDED
@@ -0,0 +1,16 @@
asr_model: openai/whisper-large-v3-turbo
llm_model: google/gemma-2-2b
svs_model: espnet/aceopencpop_svs_visinger2_40singer_pretrain
melody_source: sample-lyric-kising
language: mandarin
prompt_template_character: Yaoyin
speaker: 9
cache_dir: .cache

track_latency: True
evaluators:
  svs:
    - singmos
    - per
    - melody
    - aesthetic
config/cli/yaoyin_test.yaml
ADDED
@@ -0,0 +1,11 @@
asr_model: openai/whisper-small
llm_model: google/gemma-2-2b
svs_model: espnet/aceopencpop_svs_visinger2_40singer_pretrain
melody_source: sample-lyric-kising
language: mandarin
max_sentences: 1
prompt_template_character: Yaoyin
speaker: 9
cache_dir: .cache

track_latency: True
config/{default.yaml → interface/default.yaml}
RENAMED
File without changes
config/{options.yaml → interface/options.yaml}
RENAMED
@@ -5,6 +5,8 @@ asr_models:
     name: Whisper large-v3
   - id: openai/whisper-medium
     name: Whisper medium
+  - id: openai/whisper-small
+    name: Whisper small
   - id: sanchit-gandhi/whisper-small-dv
     name: Whisper small-dv
   - id: facebook/wav2vec2-base-960h
@@ -15,38 +17,40 @@ llm_models:
     name: Gemma 2 2B
   - id: MiniMaxAI/MiniMax-M1-80k
     name: MiniMax M1 80k
+  - id: meta-llama/Llama-3.2-3B-Instruct
+    name: Llama 3.2 3B Instruct
 
 svs_models:
   - id: mandarin-espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
     name: Visinger2 (Bilingual)-zh
     model_path: espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
     lang: mandarin
-
-
-
-
-
-
+    voices:
+      voice1: resource/singer/singer_embedding_ace-2.npy
+      voice2: resource/singer/singer_embedding_ace-8.npy
+      voice3: resource/singer/singer_embedding_itako.npy
+      voice4: resource/singer/singer_embedding_kising_orange.npy
+      voice5: resource/singer/singer_embedding_m4singer_Alto-4.npy
   - id: japanese-espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
     name: Visinger2 (Bilingual)-jp
     model_path: espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
     lang: japanese
-
-
-
-
-
-
+    voices:
+      voice1: resource/singer/singer_embedding_ace-2.npy
+      voice2: resource/singer/singer_embedding_ace-8.npy
+      voice3: resource/singer/singer_embedding_itako.npy
+      voice4: resource/singer/singer_embedding_kising_orange.npy
+      voice5: resource/singer/singer_embedding_m4singer_Alto-4.npy
   - id: mandarin-espnet/aceopencpop_svs_visinger2_40singer_pretrain
     name: Visinger2 (Chinese)
     model_path: espnet/aceopencpop_svs_visinger2_40singer_pretrain
     lang: mandarin
-
-
-
-
-
-
+    voices:
+      voice1: 5
+      voice2: 8
+      voice3: 12
+      voice4: 15
+      voice5: 29
 
 melody_sources:
   - id: gen-random-none
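The `voices` mapping added above is what the interface resolves a character's `default_voice` against. A rough sketch of that lookup, using values from this options file (the dict-style access mirrors interface.py further down):

```python
# Sketch: resolving a voice name to the value handed to the SVS model.
# For the bilingual checkpoint the value is a speaker-embedding .npy path;
# for the Chinese 40-singer checkpoint it is an integer speaker id.
svs_model_map = {
    "mandarin-espnet/aceopencpop_svs_visinger2_40singer_pretrain": {
        "model_path": "espnet/aceopencpop_svs_visinger2_40singer_pretrain",
        "lang": "mandarin",
        "voices": {"voice1": 5, "voice2": 8, "voice3": 12, "voice4": 15, "voice5": 29},
    }
}

current_svs_model = "mandarin-espnet/aceopencpop_svs_visinger2_40singer_pretrain"
default_voice = "voice2"  # e.g. Yaoyin's default_voice
speaker = svs_model_map[current_svs_model]["voices"][default_voice]
print(speaker)  # -> 8, passed as `speaker` to pipeline.run / svs.synthesize
```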
evaluation/svs_eval.py
CHANGED
@@ -37,7 +37,8 @@ def init_audiobox_aesthetics():
 # ----------- Evaluation -----------
 
 
-def eval_singmos(
+def eval_singmos(audio_path, predictor):
+    audio_array, sr = librosa.load(audio_path, sr=44100)
     wav = librosa.resample(audio_array, orig_sr=sr, target_sr=16000)
     wav_tensor = torch.from_numpy(wav).unsqueeze(0)
     length_tensor = torch.tensor([wav_tensor.shape[1]])
@@ -71,7 +72,8 @@ def compute_dissonance_rate(intervals, dissonant_intervals={1, 2, 6, 10, 11}):
     return np.mean(dissonant) if intervals else np.nan
 
 
-def eval_per(
+def eval_per(audio_path, model=None):
+    audio_array, sr = librosa.load(audio_path, sr=16000)
     # TODO: implement PER evaluation
     return {}
 
@@ -97,20 +99,16 @@ def load_evaluators(config):
     return loaded
 
 
-def run_evaluation(
+def run_evaluation(audio_path, evaluators):
     results = {}
     if "singmos" in evaluators:
-        results.update(eval_singmos(
+        results.update(eval_singmos(audio_path, evaluators["singmos"]))
     if "per" in evaluators:
-        results.update(eval_per(
-    # create a tmp file with unique name
-    tmp_path = Path(".tmp") / f"{uuid.uuid4()}.wav"
-    sf.write(tmp_path, audio_array, sr)
+        results.update(eval_per(audio_path, evaluators["per"]))
     if "melody" in evaluators:
-        results.update(eval_melody_metrics(
+        results.update(eval_melody_metrics(audio_path, evaluators["melody"]))
     if "aesthetic" in evaluators:
-        results.update(eval_aesthetic(
-    tmp_path.unlink()
+        results.update(eval_aesthetic(audio_path, evaluators["aesthetic"]))
     return results
 
 
@@ -122,9 +120,8 @@ if __name__ == "__main__":
     parser.add_argument("--results_csv", type=str, required=True)
     parser.add_argument("--evaluators", type=str, default="singmos,melody,aesthetic")
     args = parser.parse_args()
-    audio_array, sr = librosa.load(args.wav_path, sr=None)
     evaluators = load_evaluators(args.evaluators.split(","))
-    results = run_evaluation(
+    results = run_evaluation(args.wav_path, evaluators)
     print(results)
 
     with open(args.results_csv, "a") as f:
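After this change every metric takes the audio path directly, so a standalone evaluation is a two-call affair. A minimal sketch mirroring the `__main__` block above (the returned keys depend on which evaluators actually load):

```python
# Sketch: offline evaluation of a synthesized wav, same API as svs_eval.py.
from evaluation.svs_eval import load_evaluators, run_evaluation

evaluators = load_evaluators(["singmos", "melody", "aesthetic"])
results = run_evaluation("outputs/yaoyin_hello.wav", evaluators)
for name, value in results.items():
    print(f"{name}: {value}")
```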
interface.py
CHANGED
@@ -1,3 +1,6 @@
+import time
+import uuid
+
 import gradio as gr
 import yaml
 
@@ -17,8 +20,8 @@ class GradioInterface:
         self.current_svs_model = (
             f"{self.default_config['language']}-{self.default_config['svs_model']}"
         )
-        self.
-            self.character_info[self.current_character].
+        self.current_voice = self.svs_model_map[self.current_svs_model]["voices"][
+            self.character_info[self.current_character].default_voice
         ]
         self.pipeline = SingingDialoguePipeline(self.default_config)
 
@@ -104,21 +107,21 @@
                         value=self.current_svs_model,
                     )
                 with gr.Row():
-
-                        label="Singing
+                    voice_radio = gr.Radio(
+                        label="Singing voice",
                         choices=list(
                             self.svs_model_map[self.current_svs_model][
-                                "
+                                "voices"
                             ].keys()
                         ),
                         value=self.character_info[
                             self.current_character
-                        ].
+                        ].default_voice,
                     )
                 character_radio.change(
                     fn=self.update_character,
                     inputs=character_radio,
-                    outputs=[character_image,
+                    outputs=[character_image, voice_radio],
                 )
                 asr_radio.change(
                     fn=self.update_asr_model, inputs=asr_radio, outputs=asr_radio
@@ -129,35 +132,41 @@
                 svs_radio.change(
                     fn=self.update_svs_model,
                     inputs=svs_radio,
-                    outputs=[svs_radio,
+                    outputs=[svs_radio, voice_radio],
                 )
                 melody_radio.change(
                     fn=self.update_melody_source,
                     inputs=melody_radio,
                     outputs=melody_radio,
                 )
-
-                    fn=self.
+                voice_radio.change(
+                    fn=self.update_voice, inputs=voice_radio, outputs=voice_radio
                 )
                 mic_input.change(
                     fn=self.run_pipeline,
                     inputs=mic_input,
                     outputs=[interaction_log, audio_output],
                 )
+                metrics_button.click(
+                    fn=self.update_metrics,
+                    inputs=audio_output,
+                    outputs=[metrics_output],
+                )
 
             return demo
         except Exception as e:
             print(f"error: {e}")
             breakpoint()
+            return gr.Blocks()
 
     def update_character(self, character):
         self.current_character = character
-
-        self.
-
+        character_voice = self.character_info[self.current_character].default_voice
+        self.current_voice = self.svs_model_map[self.current_svs_model]["voices"][
+            character_voice
         ]
         return gr.update(value=self.character_info[character].image_path), gr.update(
-            value=
+            value=character_voice
         )
 
     def update_asr_model(self, asr_model):
@@ -170,23 +179,23 @@
 
     def update_svs_model(self, svs_model):
         self.current_svs_model = svs_model
-
-        self.
-
+        character_voice = self.character_info[self.current_character].default_voice
+        self.current_voice = self.svs_model_map[self.current_svs_model]["voices"][
+            character_voice
         ]
         self.pipeline.set_svs_model(
             self.svs_model_map[self.current_svs_model]["model_path"]
         )
         print(
-            f"SVS model updated to {self.current_svs_model}. Will set gradio svs_radio to {svs_model} and
+            f"SVS model updated to {self.current_svs_model}. Will set gradio svs_radio to {svs_model} and voice_radio to {character_voice}"
         )
         return (
             gr.update(value=svs_model),
             gr.update(
                 choices=list(
-                    self.svs_model_map[self.current_svs_model]["
+                    self.svs_model_map[self.current_svs_model]["voices"].keys()
                 ),
-                value=
+                value=character_voice,
             ),
         )
 
@@ -194,24 +203,30 @@
         self.current_melody_source = melody_source
         return gr.update(value=self.current_melody_source)
 
-    def 
-        self.
-
-        ]
-        return gr.update(value=timbre)
+    def update_voice(self, voice):
+        self.current_voice = self.svs_model_map[self.current_svs_model]["voices"][voice]
+        return gr.update(value=voice)
 
     def run_pipeline(self, audio_path):
+        if not audio_path:
+            return gr.update(value=""), gr.update(value="")
+        tmp_file = f"audio_{int(time.time())}_{uuid.uuid4().hex[:8]}.wav"
         results = self.pipeline.run(
             audio_path,
            self.svs_model_map[self.current_svs_model]["lang"],
             self.character_info[self.current_character].prompt,
-
-
-
-            max_new_tokens=100,
+            self.current_voice,
+            output_audio_path=tmp_file,
+            max_new_tokens=50,
         )
         formatted_logs = f"ASR: {results['asr_text']}\nLLM: {results['llm_text']}"
-        return gr.update(value=formatted_logs), gr.update(
+        return gr.update(value=formatted_logs), gr.update(
+            value=results["output_audio_path"]
+        )
 
-    def 
-
+    def update_metrics(self, audio_path):
+        if not audio_path:
+            return gr.update(value="")
+        results = self.pipeline.evaluate(audio_path)
+        formatted_metrics = "\n".join([f"{k}: {v}" for k, v in results.items()])
+        return gr.update(value=formatted_metrics)
modules/asr.py
CHANGED
@@ -1,5 +1,4 @@
-
-
+import os
 from abc import ABC, abstractmethod
 
 import librosa
@@ -7,17 +6,17 @@ import numpy as np
 from transformers import pipeline
 
 ASR_MODEL_REGISTRY = {}
+hf_token = os.getenv("HF_TOKEN")
 
 
 class AbstractASRModel(ABC):
-    @abstractmethod
     def __init__(
         self, model_id: str, device: str = "cpu", cache_dir: str = "cache", **kwargs
     ):
+        print(f"Loading ASR model {model_id}...")
         self.model_id = model_id
         self.device = device
         self.cache_dir = cache_dir
-        pass
 
     @abstractmethod
     def transcribe(self, audio: np.ndarray, audio_sample_rate: int, **kwargs) -> str:
@@ -52,15 +51,11 @@ class WhisperASR(AbstractASRModel):
             "automatic-speech-recognition",
             model=model_id,
             device=0 if device == "cuda" else -1,
+            token=hf_token,
             **kwargs,
         )
 
     def transcribe(self, audio: np.ndarray, audio_sample_rate: int, language: str, **kwargs) -> str:
         if audio_sample_rate != 16000:
-
-
-            except Exception as e:
-                breakpoint()
-                print(f"Error resampling audio: {e}")
-        audio = librosa.resample(audio, orig_sr=audio_sample_rate, target_sr=16000)
-        return self.pipe(audio, generate_kwargs={"language": language}).get("text", "")
+            audio = librosa.resample(audio, orig_sr=audio_sample_rate, target_sr=16000)
+        return self.pipe(audio, generate_kwargs={"language": language}, return_timestamps=False).get("text", "")
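With the resampling branch fixed, standalone transcription is a short call. A hedged sketch, assuming `get_asr_model` (the factory imported by pipeline.py, not shown in this diff) accepts the same `device`/`cache_dir` keywords as `get_llm_model`:

```python
# Sketch: standalone ASR call against the module above (assumptions noted).
import librosa

from modules.asr import get_asr_model  # factory used by pipeline.py

audio, sr = librosa.load("tests/audio/hello.wav", sr=None)  # any sample rate
# Assumed signature: mirrors get_llm_model(model_id, device=..., **kwargs).
asr = get_asr_model("openai/whisper-large-v3-turbo", device="cpu", cache_dir=".cache")
text = asr.transcribe(audio, audio_sample_rate=sr, language="mandarin")
print(text)
```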
modules/llm/__init__.py
ADDED
@@ -0,0 +1,11 @@
from .base import AbstractLLMModel
from .registry import LLM_MODEL_REGISTRY, get_llm_model, register_llm_model
from .hf_pipeline import HFTextGenerationLLM
from .qwen import QwenLLM

__all__ = [
    "AbstractLLMModel",
    "get_llm_model",
    "register_llm_model",
    "LLM_MODEL_REGISTRY",
]
modules/llm/base.py
ADDED
@@ -0,0 +1,15 @@
from abc import ABC, abstractmethod


class AbstractLLMModel(ABC):
    def __init__(
        self, model_id: str, device: str = "cpu", cache_dir: str = "cache", **kwargs
    ):
        print(f"Loading LLM model {model_id}...")
        self.model_id = model_id
        self.device = device
        self.cache_dir = cache_dir

    @abstractmethod
    def generate(self, prompt: str, **kwargs) -> str:
        pass
modules/{llm.py → llm/hf_pipeline.py}
RENAMED
@@ -1,44 +1,21 @@
-
+import os
 
 from transformers import pipeline
 
-
+from .base import AbstractLLMModel
+from .registry import register_llm_model
 
-
-class AbstractLLMModel(ABC):
-    @abstractmethod
-    def __init__(
-        self, model_id: str, device: str = "cpu", cache_dir: str = "cache", **kwargs
-    ): ...
-
-    @abstractmethod
-    def generate(self, prompt: str, **kwargs) -> str:
-        pass
-
-
-def register_llm_model(prefix: str):
-    def wrapper(cls):
-        assert issubclass(cls, AbstractLLMModel), f"{cls} must inherit AbstractLLMModel"
-        LLM_MODEL_REGISTRY[prefix] = cls
-        return cls
-
-    return wrapper
-
-
-def get_llm_model(model_id: str, device="cpu", **kwargs) -> AbstractLLMModel:
-    for prefix, cls in LLM_MODEL_REGISTRY.items():
-        if model_id.startswith(prefix):
-            return cls(model_id, device=device, **kwargs)
-    raise ValueError(f"No LLM wrapper found for model: {model_id}")
+hf_token = os.getenv("HF_TOKEN")
 
 
-@register_llm_model("
-@register_llm_model("
-@register_llm_model("meta-llama")
+@register_llm_model("openai-community/")
+@register_llm_model("google/gemma-")
+@register_llm_model("meta-llama/Llama-")
 class HFTextGenerationLLM(AbstractLLMModel):
     def __init__(
         self, model_id: str, device: str = "cpu", cache_dir: str = "cache", **kwargs
     ):
+        super().__init__(model_id, device, cache_dir, **kwargs)
         model_kwargs = kwargs.setdefault("model_kwargs", {})
         model_kwargs["cache_dir"] = cache_dir
         self.pipe = pipeline(
@@ -46,6 +23,8 @@ class HFTextGenerationLLM(AbstractLLMModel):
             model=model_id,
             device=0 if device == "cuda" else -1,
             return_full_text=False,
+            token=hf_token,
+            trust_remote_code=True,
             **kwargs,
         )
 
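The prefix decorators above mean any model id beginning with `openai-community/`, `google/gemma-`, or `meta-llama/Llama-` resolves to this wrapper. Usage matches tests/test_llm_infer.py at the end of this commit:

```python
# Sketch: text generation through the registry (same calls as the test below).
from modules.llm import get_llm_model

llm = get_llm_model("google/gemma-2-2b", cache_dir="./.cache")
reply = llm.generate("你好,今天你心情怎么样?")
print(reply)
```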
modules/llm/registry.py
ADDED
@@ -0,0 +1,19 @@
from .base import AbstractLLMModel

LLM_MODEL_REGISTRY = {}


def register_llm_model(prefix: str):
    def wrapper(cls):
        assert issubclass(cls, AbstractLLMModel), f"{cls} must inherit AbstractLLMModel"
        LLM_MODEL_REGISTRY[prefix] = cls
        return cls

    return wrapper


def get_llm_model(model_id: str, device="cpu", **kwargs) -> AbstractLLMModel:
    for prefix, cls in LLM_MODEL_REGISTRY.items():
        if model_id.startswith(prefix):
            return cls(model_id, device=device, **kwargs)
    raise ValueError(f"No LLM wrapper found for model: {model_id}")
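Splitting the registry into its own module makes it straightforward to plug in further backends. A hypothetical sketch (the `EchoLLM` class and the `example-org/` prefix are made up purely for illustration):

```python
# Hypothetical example: registering an extra backend with the registry above.
from modules.llm import AbstractLLMModel, get_llm_model, register_llm_model


@register_llm_model("example-org/")  # any model id starting with this prefix
class EchoLLM(AbstractLLMModel):
    """Toy backend that just echoes the prompt (illustration only)."""

    def generate(self, prompt: str, **kwargs) -> str:
        return f"[{self.model_id}] {prompt}"


llm = get_llm_model("example-org/echo-1b")
print(llm.generate("hello"))
```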
modules/melody.py
CHANGED
@@ -109,9 +109,10 @@ class MelodyController:
             if pitch == 0:
                 score.append((st, ed, ref_lyric, pitch))
             elif ref_lyric in ["-", "——"] and align_type == "lyric":
-                score.append((st, ed,
-                text_idx += 1
+                score.append((st, ed, "-", pitch))
             else:
                 score.append((st, ed, text_list[text_idx], pitch))
                 text_idx += 1
+                if text_idx >= len(text_list):
+                    break
         return score
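The added bounds check guards against replies shorter than the reference melody: once the lyric characters run out, the score is cut instead of indexing past the end of `text_list`. A tiny worked example of the behaviour this protects, simplified from the loop above (values are illustrative, the `align_type` branch is omitted):

```python
# Illustrative only: mimics the alignment loop's handling of a short lyric.
reference = [  # (start_sec, end_sec, ref_lyric, midi_pitch) from the melody source
    (0.0, 0.5, "你", 60),
    (0.5, 1.0, "-", 60),   # "-" continues the previous syllable
    (1.0, 1.5, "好", 62),
    (1.5, 2.0, "吗", 64),
]
text_list = ["早", "安"]   # LLM reply shorter than the melody

score, text_idx = [], 0
for st, ed, ref_lyric, pitch in reference:
    if pitch == 0:
        score.append((st, ed, ref_lyric, pitch))
    elif ref_lyric in ["-", "——"]:
        score.append((st, ed, "-", pitch))
    else:
        score.append((st, ed, text_list[text_idx], pitch))
        text_idx += 1
        if text_idx >= len(text_list):
            break  # same early exit as the fix above

print(score)  # [(0.0, 0.5, '早', 60), (0.5, 1.0, '-', 60), (1.0, 1.5, '安', 62)]
```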
modules/svs/base.py
CHANGED
@@ -13,6 +13,8 @@ class AbstractSVSModel(ABC):
     def synthesize(
         self,
         score: list[tuple[float, float, str, int]],
+        language: str,
+        speaker: str,
         **kwargs,
     ) -> tuple[np.ndarray, int]:
         """
modules/svs/espnet.py
CHANGED
@@ -53,7 +53,7 @@ class ESPNetSVS(AbstractSVSModel):
         phoneme_mappers = {}
         return phoneme_mappers
 
-    def _preprocess(self, score: list[tuple[float, float, str, int]], language: str):
+    def _preprocess(self, score: list[tuple[float, float, str, int] | tuple[float, float, str, float]], language: str):
         if language not in self.phoneme_mappers:
             raise ValueError(f"Unsupported language: {language} for {self.model_id}")
         phoneme_mapper = self.phoneme_mappers[language]
@@ -90,20 +90,20 @@ class ESPNetSVS(AbstractSVSModel):
             pre_phn = phn_units[-1]
 
         batch = {
-            "score":
-
-
-
+            "score": (
+                120,  # does not affect svs result, as note durations are in time unit
+                notes,
+            ),
             "text": " ".join(phns),
         }
         return batch
 
     def synthesize(
-        self, score: list[tuple[float, float, str, int]], language: str, **kwargs
+        self, score: list[tuple[float, float, str, float] | tuple[float, float, str, int]], language: str, speaker: str, **kwargs
     ):
         batch = self._preprocess(score, language)
         if self.model_id == "espnet/aceopencpop_svs_visinger2_40singer_pretrain":
-            sid = np.array([int(
+            sid = np.array([int(speaker)])
             output_dict = self.model(batch, sids=sid)
         elif self.model_id == "espnet/mixdata_svs_visinger2_spkemb_lang_pretrained":
             langs = {
@@ -115,7 +115,7 @@ class ESPNetSVS(AbstractSVSModel):
                     f"Unsupported language: {language} for {self.model_id}"
                 )
             lid = np.array([langs[language]])
-            spk_embed = np.load(
+            spk_embed = np.load(speaker)
             output_dict = self.model(batch, lids=lid, spembs=spk_embed)
         else:
             raise NotImplementedError(f"Model {self.model_id} not supported")
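With `speaker` now an explicit argument, both checkpoints are driven through the same `synthesize()` call, only the meaning of `speaker` changes. A hedged sketch; it assumes an already constructed `ESPNetSVS` instance (how the model is built is outside this diff), and the score values are made up:

```python
# Sketch: driving either checkpoint through the unified synthesize() signature.
import soundfile as sf


def sing(svs, speaker):
    # (start_sec, end_sec, lyric, midi_pitch) tuples; values illustrative only.
    score = [(0.0, 0.5, "你", 60), (0.5, 1.0, "好", 62)]
    audio, sr = svs.synthesize(score, language="mandarin", speaker=speaker)
    sf.write("outputs/svs_demo.wav", audio, sr)


# Chinese 40-singer checkpoint: speaker is an integer singer id (as a string).
# sing(svs, "5")
# Bilingual checkpoint: speaker is a path to a singer-embedding .npy file.
# sing(svs, "resource/singer/singer_embedding_ace-2.npy")
```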
modules/utils/g2p.py
CHANGED
@@ -32,6 +32,7 @@ for plan in ace_phonemes_all_plans["plans"]:
 
 
 def preprocess_text(text: str, language: str) -> list[str]:
+    text = text.replace(" ", "")
     if language == "mandarin":
         text_list = to_pinyin(text)
     elif language == "japanese":
modules/utils/text_normalize.py
CHANGED
@@ -3,12 +3,13 @@ from typing import Optional
 
 
 def remove_non_zh_jp(text: str) -> str:
-    pattern = r"[^\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\u3000-\u303f\uff01-\uffef]"
+    pattern = r"[^\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\u3000-\u303f\uff01-\uffef\s]"
     return re.sub(pattern, "", text)
 
 
 def truncate_sentences(text: str, max_sentences: int) -> str:
-    sentences = re.split(r"(?<=[
+    sentences = re.split(r"(?<=[。!?!?~])|(?:\n+)|(?: {2,})", text)
+    sentences = [s.strip() for s in sentences if s.strip()]
     return "".join(sentences[:max_sentences]).strip()
 
 
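The widened split pattern now breaks on CJK sentence punctuation as well as newlines and runs of double spaces, which is what `max_sentences: 1` in yaoyin_test.yaml relies on. A quick self-contained check of the new regex:

```python
# Sketch: effect of the new sentence splitting on LLM-output truncation.
import re


def truncate_sentences(text: str, max_sentences: int) -> str:
    sentences = re.split(r"(?<=[。!?!?~])|(?:\n+)|(?: {2,})", text)
    sentences = [s.strip() for s in sentences if s.strip()]
    return "".join(sentences[:max_sentences]).strip()


print(truncate_sentences("你好!很高兴见到你。今天想聊什么?", max_sentences=1))
# -> "你好!"
```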
pipeline.py
CHANGED
@@ -1,6 +1,11 @@
-import 
+from __future__ import annotations
+
 import time
+from pathlib import Path
+
 import librosa
+import soundfile as sf
+import torch
 
 from modules.asr import get_asr_model
 from modules.llm import get_llm_model
@@ -29,6 +34,7 @@ class SingingDialoguePipeline:
         self.melody_controller = MelodyController(
             config["melody_source"], self.cache_dir
         )
+        self.max_sentences = config.get("max_sentences", 2)
         self.track_latency = config.get("track_latency", False)
         self.evaluators = load_evaluators(config.get("evaluators", {}).get("svs", []))
 
@@ -55,8 +61,9 @@ class SingingDialoguePipeline:
         audio_path,
         language,
         prompt_template,
-
-
+        speaker,
+        output_audio_path: Path | str = None,
+        max_new_tokens=50,
     ):
         if self.track_latency:
             asr_start_time = time.time()
@@ -75,13 +82,14 @@ class SingingDialoguePipeline:
         if self.track_latency:
             llm_end_time = time.time()
             llm_latency = llm_end_time - llm_start_time
-
-
+        llm_response = clean_llm_output(
+            output, language=language, max_sentences=self.max_sentences
+        )
         score = self.melody_controller.generate_score(llm_response, language)
         if self.track_latency:
             svs_start_time = time.time()
         singing_audio, sample_rate = self.svs.synthesize(
-            score, language=language,
+            score, language=language, speaker=speaker
        )
         if self.track_latency:
             svs_end_time = time.time()
@@ -89,15 +97,19 @@ class SingingDialoguePipeline:
         results = {
             "asr_text": asr_result,
             "llm_text": llm_response,
-            "svs_audio": (
+            "svs_audio": (sample_rate, singing_audio),
         }
+        if output_audio_path:
+            Path(output_audio_path).parent.mkdir(parents=True, exist_ok=True)
+            sf.write(output_audio_path, singing_audio, sample_rate)
+            results["output_audio_path"] = output_audio_path
         if self.track_latency:
-            results["metrics"]
+            results["metrics"] = {
                 "asr_latency": asr_latency,
                 "llm_latency": llm_latency,
                 "svs_latency": svs_latency,
-            }
+            }
         return results
 
-    def evaluate(self,
-        return run_evaluation(
+    def evaluate(self, audio_path):
+        return run_evaluation(audio_path, self.evaluators)
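With `track_latency` enabled (as in the CLI configs above), the dict returned by `run()` carries per-stage timings alongside the audio, and `evaluate()` reuses the evaluators loaded at construction. A short sketch of consuming a result, assuming a config shaped like config/cli/yaoyin_default.yaml:

```python
# Sketch: per-stage latency + offline metrics from a single run.
import yaml

from characters import CHARACTERS
from pipeline import SingingDialoguePipeline

with open("config/cli/yaoyin_default.yaml") as f:
    config = yaml.safe_load(f)

pipeline = SingingDialoguePipeline(config)
results = pipeline.run(
    "tests/audio/hello.wav",
    config["language"],
    CHARACTERS[config["prompt_template_character"]].prompt,
    config["speaker"],
    output_audio_path="outputs/reply.wav",
)

print(results["metrics"])  # asr/llm/svs latency in seconds (track_latency: True)
print(pipeline.evaluate(results["output_audio_path"]))  # singmos/melody/... scores
```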
requirements.txt
CHANGED
@@ -1,6 +1,6 @@
 git+https://github.com/espnet/espnet
 espnet_model_zoo
-
+pyopenjtalk
 datasets
 torchaudio
 typeguard==4.4.0
@@ -15,3 +15,6 @@ transformers
 s3prl
 zhconv
 git+https://github.com/sea-turt1e/kanjiconv
+soundfile
+PyYAML
+gradio
tests/__init__.py
ADDED
File without changes
tests/audio/chat.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:181a7f27f8acb00cba0276d0ff88759120a76eebd47b4e0a60c2424e43e5cbaf
size 271030
tests/audio/feeling.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0fef036c2bf0ddf635a004845e94c89d0658f754a53e12fadbb50511d3cd6c15
size 263502
tests/audio/hello.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:aa7e839d32f7bda77cad11fc13fd1b92df939479612dd5af079d8f9b19598c0d
size 263502
tests/audio/interesting.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5a1618f73d90ad068d5eb72455ac812b49fcb9e44e88af5e67ef88f5c6ddb74a
size 429086
tests/audio/music.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6388b587e282e8f6457b629b5cbb9fd50c5cb6a7f90c446329a3f23be8b1442c
size 286082
tests/audio/where_from.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7ef81772b96813216d7b14d3d70a39b040e9c542d896d9337f8975f8fd6da96e
size 195766
tests/test_llm_infer.py
ADDED
@@ -0,0 +1,26 @@
from modules.llm import get_llm_model

if __name__ == "__main__":
    supported_llms = [
        # "MiniMaxAI/MiniMax-M1-80k",  # -> load with custom code
        # "Qwen/Qwen-1_8B",
        # "meta-llama/Llama-3.1-8B-Instruct",  # pending for approval
        # "tiiuae/Falcon-H1-1B-Base",
        # "tiiuae/Falcon-H1-3B-Instruct",
        # "tencent/Hunyuan-A13B-Instruct",  # -> load with custom code
        # "deepseek-ai/DeepSeek-R1-0528",
        # "openai-community/gpt2-xl",
        # "google/gemma-2-2b",
    ]
    for model_id in supported_llms:
        try:
            print(f"Loading model: {model_id}")
            llm = get_llm_model(model_id, cache_dir="./.cache")
            prompt = "你好,今天你心情怎么样?"
            result = llm.generate(prompt)
            print(f"=================")
            print(f"[{model_id}] LLM inference result:", result)
        except Exception as e:
            print(f"Failed to load model {model_id}: {e}")
            breakpoint()
            continue