Spaces:
Sleeping
Sleeping
Add CLI functionality and restructure configuration files
Browse files- app.py +2 -1
- cli.py +46 -0
- config/cli/limei_default.yaml +16 -0
- config/cli/yaoyin_default.yaml +16 -0
- config/{default.yaml → interface/default.yaml} +0 -0
- config/interface/options.yaml +63 -0
- requirements.txt +1 -0
app.py
CHANGED
@@ -3,7 +3,8 @@ from interface import GradioInterface
|
|
3 |
|
4 |
def main():
|
5 |
demo = GradioInterface(
|
6 |
-
options_config="config/options.yaml",
|
|
|
7 |
).create_interface()
|
8 |
demo.launch()
|
9 |
|
|
|
3 |
|
4 |
def main():
|
5 |
demo = GradioInterface(
|
6 |
+
options_config="config/interface/options.yaml",
|
7 |
+
default_config="config/interface/default.yaml",
|
8 |
).create_interface()
|
9 |
demo.launch()
|
10 |
|
cli.py
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from argparse import ArgumentParser
|
2 |
+
from logging import getLogger
|
3 |
+
|
4 |
+
import soundfile as sf
|
5 |
+
import yaml
|
6 |
+
|
7 |
+
from characters import CHARACTERS
|
8 |
+
from pipeline import SingingDialoguePipeline
|
9 |
+
|
10 |
+
logger = getLogger(__name__)
|
11 |
+
|
12 |
+
|
13 |
+
def get_parser():
    """Build the command-line argument parser for the CLI entry point.

    Returns:
        ArgumentParser: a parser exposing three required string options,
        ``--query_audio``, ``--config_path`` and ``--output_audio``.
    """
    parser = ArgumentParser(
        description="Run the singing dialogue pipeline on one query audio file."
    )
    parser.add_argument(
        "--query_audio",
        type=str,
        required=True,
        help="Path of the input audio handed to the pipeline.",
    )
    parser.add_argument(
        "--config_path",
        type=str,
        required=True,
        help="Path of the YAML configuration file.",
    )
    parser.add_argument(
        "--output_audio",
        type=str,
        required=True,
        help="Path the synthesized audio is written to.",
    )
    return parser
|
19 |
+
|
20 |
+
|
21 |
+
def load_config(config_path: str):
    """Load a YAML configuration file and return its top-level mapping.

    Args:
        config_path: Path of the YAML file to read.

    Returns:
        dict: the parsed configuration.

    Raises:
        OSError: if the file cannot be opened.
        yaml.YAMLError: if the file is not valid YAML.
        ValueError: if the document is empty or its top level is not a
            mapping (callers index the result by key).
    """
    # Decode explicitly as UTF-8 so parsing does not depend on the
    # platform's default locale encoding.
    with open(config_path, "r", encoding="utf-8") as f:
        config = yaml.safe_load(f)
    if not isinstance(config, dict):
        raise ValueError(
            f"Config file {config_path!r} must contain a YAML mapping, "
            f"got {type(config).__name__}"
        )
    return config
|
25 |
+
|
26 |
+
|
27 |
+
def main():
    """Run the pipeline once from the command line.

    Parses the CLI arguments, loads the YAML config, runs
    ``SingingDialoguePipeline`` on the query audio with the configured
    language, speaker and character prompt, logs the ASR/LLM texts, and
    writes the synthesized audio to the ``--output_audio`` path.
    """
    # Local import: only ``getLogger`` is imported at file level.
    import logging

    parser = get_parser()
    args = parser.parse_args()

    # Without any logging configuration the INFO record below is dropped
    # (the default threshold is WARNING). basicConfig does nothing if the
    # root logger already has handlers.
    logging.basicConfig(level=logging.INFO)

    config = load_config(args.config_path)
    pipeline = SingingDialoguePipeline(config)

    speaker = config["speaker"]
    language = config["language"]
    character_name = config["prompt_template_character"]
    character = CHARACTERS[character_name]
    prompt_template = character.prompt

    results = pipeline.run(args.query_audio, language, prompt_template, speaker)
    logger.info(
        "Input: %s, Output: %s, ASR results: %s, LLM results: %s",
        args.query_audio,
        args.output_audio,
        results["asr_text"],
        results["llm_text"],
    )

    # results["svs_audio"] is unpacked as an (audio, sample_rate) pair.
    svs_audio, svs_sample_rate = results["svs_audio"]
    sf.write(args.output_audio, svs_audio, svs_sample_rate)
|
43 |
+
|
44 |
+
|
45 |
+
if __name__ == "__main__":
|
46 |
+
main()
|
config/cli/limei_default.yaml
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
asr_model: openai/whisper-large-v3-turbo
|
2 |
+
llm_model: google/gemma-2-2b
|
3 |
+
svs_model: espnet/aceopencpop_svs_visinger2_40singer_pretrain
|
4 |
+
melody_source: sample-lyric-kising
|
5 |
+
language: mandarin
|
6 |
+
prompt_template_character: Limei
|
7 |
+
speaker: 5
|
8 |
+
cache_dir: .cache
|
9 |
+
|
10 |
+
track_latency: True
|
11 |
+
evaluators:
|
12 |
+
svs:
|
13 |
+
- singmos
|
14 |
+
- per
|
15 |
+
- melody
|
16 |
+
- aesthetic
|
config/cli/yaoyin_default.yaml
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
asr_model: openai/whisper-large-v3-turbo
|
2 |
+
llm_model: google/gemma-2-2b
|
3 |
+
svs_model: espnet/aceopencpop_svs_visinger2_40singer_pretrain
|
4 |
+
melody_source: sample-lyric-kising
|
5 |
+
language: mandarin
|
6 |
+
prompt_template_character: Yaoyin
|
7 |
+
speaker: 9
|
8 |
+
cache_dir: .cache
|
9 |
+
|
10 |
+
track_latency: True
|
11 |
+
evaluators:
|
12 |
+
svs:
|
13 |
+
- singmos
|
14 |
+
- per
|
15 |
+
- melody
|
16 |
+
- aesthetic
|
config/{default.yaml → interface/default.yaml}
RENAMED
File without changes
|
config/interface/options.yaml
ADDED
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
asr_models:
|
2 |
+
- id: openai/whisper-large-v3-turbo
|
3 |
+
name: Whisper large-v3-turbo
|
4 |
+
- id: openai/whisper-large-v3
|
5 |
+
name: Whisper large-v3
|
6 |
+
- id: openai/whisper-medium
|
7 |
+
name: Whisper medium
|
8 |
+
- id: sanchit-gandhi/whisper-small-dv
|
9 |
+
name: Whisper small-dv
|
10 |
+
- id: facebook/wav2vec2-base-960h
|
11 |
+
name: Wav2Vec2-Base-960h
|
12 |
+
|
13 |
+
llm_models:
|
14 |
+
- id: google/gemma-2-2b
|
15 |
+
name: Gemma 2 2B
|
16 |
+
- id: MiniMaxAI/MiniMax-M1-80k
|
17 |
+
name: MiniMax M1 80k
|
18 |
+
|
19 |
+
svs_models:
|
20 |
+
- id: mandarin-espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
|
21 |
+
name: Visinger2 (Bilingual)-zh
|
22 |
+
model_path: espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
|
23 |
+
lang: mandarin
|
24 |
+
voices:
|
25 |
+
voice1: resource/singer/singer_embedding_ace-2.npy
|
26 |
+
voice2: resource/singer/singer_embedding_ace-8.npy
|
27 |
+
voice3: resource/singer/singer_embedding_itako.npy
|
28 |
+
voice4: resource/singer/singer_embedding_kising_orange.npy
|
29 |
+
voice5: resource/singer/singer_embedding_m4singer_Alto-4.npy
|
30 |
+
- id: japanese-espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
|
31 |
+
name: Visinger2 (Bilingual)-jp
|
32 |
+
model_path: espnet/mixdata_svs_visinger2_spkemb_lang_pretrained
|
33 |
+
lang: japanese
|
34 |
+
voices:
|
35 |
+
voice1: resource/singer/singer_embedding_ace-2.npy
|
36 |
+
voice2: resource/singer/singer_embedding_ace-8.npy
|
37 |
+
voice3: resource/singer/singer_embedding_itako.npy
|
38 |
+
voice4: resource/singer/singer_embedding_kising_orange.npy
|
39 |
+
voice5: resource/singer/singer_embedding_m4singer_Alto-4.npy
|
40 |
+
- id: mandarin-espnet/aceopencpop_svs_visinger2_40singer_pretrain
|
41 |
+
name: Visinger2 (Chinese)
|
42 |
+
model_path: espnet/aceopencpop_svs_visinger2_40singer_pretrain
|
43 |
+
lang: mandarin
|
44 |
+
voices:
|
45 |
+
voice1: 5
|
46 |
+
voice2: 8
|
47 |
+
voice3: 12
|
48 |
+
voice4: 15
|
49 |
+
voice5: 29
|
50 |
+
|
51 |
+
melody_sources:
|
52 |
+
- id: gen-random-none
|
53 |
+
name: Random Generation
|
54 |
+
desc: "Melody is generated without any structure or reference."
|
55 |
+
- id: sample-note-kising
|
56 |
+
name: Sampled Melody (KiSing)
|
57 |
+
desc: "Melody is retrieved from KiSing dataset."
|
58 |
+
- id: sample-note-touhou
|
59 |
+
name: Sampled Melody (Touhou)
|
60 |
+
desc: "Melody is retrieved from Touhou dataset."
|
61 |
+
- id: sample-lyric-kising
|
62 |
+
name: Sampled Melody with Lyrics (Kising)
|
63 |
+
desc: "Melody with aligned lyrics is sampled from KiSing dataset."
|
requirements.txt
CHANGED
@@ -17,3 +17,4 @@ zhconv
|
|
17 |
git+https://github.com/sea-turt1e/kanjiconv
|
18 |
soundfile
|
19 |
PyYAML
|
|
|
|
17 |
git+https://github.com/sea-turt1e/kanjiconv
|
18 |
soundfile
|
19 |
PyYAML
|
20 |
+
gradio
|